-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Qunsong/qa eval groundedness service (#34797)
* modify built-in qa evaluation flow
* move groundedness evaluation to groundedness evaluation service
* change content harm level "Safe" to "Very low"
* add default groundedness results to qa eval flow
* modify built-in qa evaluation flow:
* add logic to check service availability in a region
* change hate_fairness to hate_unfairness
* add gpt_groundedness to qa node list
* update built-in qa evaluation flow
* add flight control to flow input
* code flake8 cleaning
* round f1_score in built_in qa eval flow
* metric name update
* update e2e test of qa built-in evaluation flow
* update built-in qa evaluation flow
* fix fallback check logic in validate_groundedness_service
* add e2e test of fallback groundedness
- Loading branch information
Showing
25 changed files
with
657 additions
and
296 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_local_flow_handler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 2 additions & 0 deletions
2
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
...zure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/call_groundedness_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from promptflow import tool | ||
from rai_client import RAIServiceHandler | ||
|
||
|
||
# Prompt flow tool: creates an RAIServiceHandler, passes `request_body` to its get_annotation() and returns the result unchanged. | ||
# NOTE(review): the return annotation `[dict]` is a list literal, not a type; presumably "list of dict" is meant - confirm against RAIServiceHandler.get_annotation. | ||
@tool | ||
def call_groundedness_service(request_body: dict) -> [dict]: | ||
service_handler = RAIServiceHandler() | ||
annotation_results = service_handler.get_annotation(request_body) | ||
return annotation_results |
77 changes: 4 additions & 73 deletions
77
...erative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/call_rai_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,9 @@ | ||
from promptflow import tool | ||
from mlflow.utils.rest_utils import http_request | ||
import time | ||
from utils import get_cred | ||
from constants import RAIService | ||
from rai_client import RAIServiceHandler | ||
|
||
|
||
# POSTs `request_body` as JSON to the "/submitannotation" endpoint using `cred` as host credentials. | ||
# Returns the decoded JSON body of the response, or an empty dict when the request raised AttributeError. | ||
# NOTE(review): indentation is lost in this rendering, so it cannot be told here whether raise_for_status() runs only for non-202 responses - confirm against the raw file. | ||
def submit_annotation(cred, request_body): | ||
try: | ||
response = http_request( | ||
host_creds=cred, | ||
endpoint="/submitannotation", | ||
method="POST", | ||
json=request_body, | ||
) | ||
|
||
# Any status other than 202 is reported on stdout together with the submitted text list. | ||
if response.status_code != 202: | ||
print("Fail evaluating '%s' with error message: %s" %(request_body["UserTextList"], response.text)) | ||
response.raise_for_status() | ||
# Only AttributeError is handled here; any other exception propagates to the caller. | ||
except AttributeError as e: | ||
response = None | ||
print("Fail evaluating '%s' with error message: %s" % (request_body["UserTextList"], e)) | ||
if response is not None: | ||
json_obj = response.json() | ||
else: | ||
json_obj = {} | ||
return json_obj | ||
|
||
# Issues a GET to "/operations/<request_id>" using `cred` as host credentials and returns the response object. | ||
# Returns None when the request raised AttributeError; the bound exception `e` is not used. | ||
def check_status(cred, request_id): | ||
try: | ||
response = http_request( | ||
host_creds = cred, | ||
endpoint="/operations/" + request_id, | ||
method="GET" | ||
) | ||
except AttributeError as e: | ||
response = None | ||
return response | ||
|
||
# Polls check_status() for the operation named by the submit response until it answers 200 or RAIService.TIMEOUT seconds have elapsed. | ||
# Returns the decoded JSON of the 200 response; raises TimeoutError once the elapsed time exceeds RAIService.TIMEOUT. | ||
# NOTE(review): indentation is lost in this rendering, so the exact nesting of the retry/sleep statements cannot be confirmed from here. | ||
def retrieve_annotation_result(cred, submitannotation_response): | ||
# The request id is the last path segment of the "location" value. | ||
# NOTE(review): raises KeyError when "location" is absent, e.g. for the empty dict submit_annotation returns on AttributeError. | ||
request_id = submitannotation_response["location"].split("/")[-1] | ||
annotation_result = None | ||
start = time.time() | ||
time_elapsed = 0 | ||
request_count = 1 | ||
# NOTE(review): `True and` is redundant; the loop is bounded only by the elapsed-time check and the break on 200. | ||
while True and time_elapsed <= RAIService.TIMEOUT: | ||
try: | ||
request_status = check_status(cred, request_id) | ||
except Exception: | ||
request_status = None | ||
if request_status: | ||
request_status_code = request_status.status_code | ||
if request_status_code == 200: | ||
annotation_result = request_status.json() | ||
break | ||
else: | ||
print("Failed to retrieve the status of RequestID: %s" % request_id) | ||
request_count += 1 | ||
# The wait grows with every attempt: SLEEPTIME raised to the current attempt count. | ||
sleep_time = RAIService.SLEEPTIME ** request_count | ||
time.sleep(sleep_time) | ||
time_elapsed = time.time() - start | ||
|
||
# NOTE(review): the format string and the timeout are passed as two separate exception args, so "%d" is never interpolated. | ||
if time_elapsed > RAIService.TIMEOUT: | ||
raise TimeoutError("Request times out after %d seconds", RAIService.TIMEOUT) | ||
|
||
return annotation_result | ||
|
||
# The inputs section will change based on the arguments of the tool function, after you save the code | ||
# Adding type to arguments and return value will help the system show the types properly | ||
# Please update the function name/signature per need | ||
# Prompt flow tool that obtains the annotation result for `request_body`. | ||
# NOTE(review): this diff view shows two versions of the body with no +/- markers. The file header reports | ||
# "4 additions & 73 deletions", so presumably the get_cred/submit_annotation/retrieve_annotation_result rows are | ||
# the removed version and the RAIServiceHandler rows are the added one - confirm against the raw diff. | ||
@tool | ||
def call_rai_service(request_body: dict) -> dict: | ||
#rai = RAIService() | ||
# First version: submit the request, then poll for its result using the same credentials. | ||
cred = get_cred() | ||
submitannotation_response = submit_annotation(cred, request_body) | ||
annotation_result = retrieve_annotation_result(cred, submitannotation_response) | ||
return annotation_result | ||
|
||
# Second version: delegate to RAIServiceHandler.get_annotation (unreachable if both versions are read as one body). | ||
service_handler = RAIServiceHandler() | ||
annotation_results = service_handler.get_annotation(request_body) | ||
return annotation_results |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
21 changes: 21 additions & 0 deletions
21
...ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_groundedness_request.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from promptflow import tool | ||
import json | ||
|
||
|
||
# Returns `user_text` with every single quote (') replaced by the two characters backslash + double quote (\"). | ||
def normalize_user_text(user_text): | ||
return user_text.replace("'", "\\\"") | ||
|
||
|
||
# Prompt flow tool: builds the request body for the "groundedness" annotation task. | ||
# question/answer/context are JSON-encoded into one string, passed through normalize_user_text, and sent as the | ||
# single entry of "UserTextList"; "MetricList" is always ["generic_groundedness"]. | ||
# NOTE(review): normalization runs after json.dumps, so each apostrophe in the inputs becomes \" inside the JSON | ||
# text and would decode as a double quote - presumably intended for the service; confirm. | ||
@tool | ||
def construct_request(question: str, | ||
answer: str, | ||
context: str) -> dict: | ||
metrics = ["generic_groundedness"] | ||
user_text = json.dumps({"question": question, | ||
"answer": answer, | ||
"context": context}) | ||
parsed_user_text = normalize_user_text(user_text) | ||
request_body = {"UserTextList": [parsed_user_text], | ||
"AnnotationTask": "groundedness", | ||
"MetricList": metrics} | ||
return request_body |
17 changes: 11 additions & 6 deletions
17
...zure/ai/generative/evaluate/pf_templates/built_in_metrics/qa/construct_service_request.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,21 @@ | ||
from promptflow import tool | ||
|
||
|
||
# Returns `user_text` with every single quote (') replaced by the two characters backslash + double quote (\"). | ||
def normalize_user_text(user_text): | ||
return user_text.replace("'", "\\\"") | ||
|
||
# The inputs section will change based on the arguments of the tool function, after you save the code | ||
# Adding type to arguments and return value will help the system show the types properly | ||
# Please update the function name/signature per need | ||
|
||
# Prompt flow tool: builds the request body for the "content harm" annotation task from a question/answer pair. | ||
# The metric list holds the keys of selected_metrics["safety_metrics"] whose values are truthy. | ||
# NOTE(review): this diff view interleaves old and new rows with no +/- markers; the signature, the `metrics` | ||
# assignment and the `request_body` assignment each appear twice. Presumably the single-line forms are the old | ||
# version and the multi-line forms the new one - confirm against the raw diff. | ||
@tool | ||
def construct_request(question: str, answer:str, selected_metrics: dict) -> dict: | ||
def construct_request(question: str, | ||
answer: str, | ||
selected_metrics: dict) -> dict: | ||
selected_safety_metrics = selected_metrics["safety_metrics"] | ||
metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]] | ||
# Multi-line form: additionally rewrites "_unfairness" to "_fairness" in each selected metric name. | ||
metrics = [metric.replace("_unfairness", "_fairness") for metric in | ||
selected_safety_metrics if selected_safety_metrics[metric]] | ||
# Question and answer are wrapped in <Human>…</> and <System>…</> markers before quote normalization. | ||
user_text = f"<Human>{question}</><System>{answer}</>" | ||
parsed_user_text = normalize_user_text(user_text) | ||
request_body = {"UserTextList": [parsed_user_text], "AnnotationTask": "content harm", "MetricList":metrics} | ||
# Multi-line form: same fields plus "PromptVersion": "0.2". | ||
request_body = {"UserTextList": [parsed_user_text], | ||
"AnnotationTask": "content harm", | ||
"MetricList": metrics, | ||
"PromptVersion": "0.2"} | ||
return request_body |
Oops, something went wrong.