update built in flows pupr 0325 (#34951)
* update built-in chat flow:

  * add more logging in validate_service
  * fix parsing error in parse_groundedness_responses.py

* update built-in qa flow

  * update qa groundedness input validation logic
  * add logging in validate_groundedness_service
qusongms authored Mar 27, 2024
1 parent c105d6f commit 555560a
Showing 8 changed files with 49 additions and 39 deletions.
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
-name: template_eval_flow
-display_name: Template Evaluation Flow
+name: chat_quality_safety_eval
+display_name: Chat Quality Safety Evaluation
 type: evaluate
 path: ./flow.dag.yaml
 description: Template Evaluation Flow
@@ -48,7 +48,10 @@ def parse_single_response(response: dict) -> list:
         else:
             metric_value = np.nan
             reasoning = ""
-        parsed_harm_response[harm_type] = float(metric_value)
+        try:
+            parsed_harm_response[harm_type] = float(metric_value)
+        except Exception:
+            parsed_harm_response[harm_type] = np.nan
         parsed_harm_response[harm_type + "_reason"] = reasoning
         parsed_response.append(parsed_harm_response)
     return parsed_response
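The try/except above is the parsing fix named in the commit message: a non-numeric metric_value coming back from the annotation service now degrades to NaN instead of raising. A minimal sketch of that guarded conversion, using made-up response values rather than real service output:

import numpy as np

# hypothetical values; "n/a" stands in for the non-numeric string that used to break float()
raw_values = {"violence": "2", "self_harm": "n/a"}

parsed_harm_response = {}
for harm_type, metric_value in raw_values.items():
    try:
        parsed_harm_response[harm_type] = float(metric_value)
    except Exception:
        # fall back to NaN so one bad field does not fail the whole parse
        parsed_harm_response[harm_type] = np.nan

print(parsed_harm_response)  # {'violence': 2.0, 'self_harm': nan}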
@@ -18,7 +18,7 @@ def is_service_available(flight: bool):
 
         if response.status_code != 200:
             print("Fail to get RAI service availability in this region.")
-            print(response.status_code)
+            print("Response_code: %d" % response.status_code)
         else:
             available_service = response.json()
             if "content harm" in available_service:
@@ -27,10 +27,12 @@ def is_service_available(flight: bool):
                 print("RAI service is not available in this region.")
             if "groundedness" in available_service and flight:
                 groundedness_service = True
-            else:
+            if not flight:
+                print("GroundednessServiceFlight is off.")
+            if "groundedness" not in available_service:
                 print("AACS service is not available in this region.")
     except Exception:
-        print("Fail to get RAI service availability in this region.")
+        print("Failed to call checkannotation endpoint.")
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service
             }
@@ -53,6 +55,8 @@ def is_safety_metrics_selected(selected_metrics):
 
 
 def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
+    if not selected_metrics["rag_metrics"]["gpt_groundedness"]:
+        print("gpt_groundedness is not selected.")
     return selected_metrics["rag_metrics"]["gpt_groundedness"]
 
 
@@ -75,8 +79,8 @@ def validate_safety_metric_input(
         chat: [dict],
         validate_chat_result: dict,
         flight: bool = True) -> dict:
-    service_available = is_service_available(flight)
     tracking_uri_set = is_tracking_uri_set()
+    service_available = is_service_available(flight)
     valid_chat = is_chat_valid(chat)
     groundedness_selected = is_groundedness_metric_selected(selected_metrics)
     content_harm_service = is_safety_metrics_selected(selected_metrics) \
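The reworked branch above logs the two groundedness failure modes separately: the GroundednessServiceFlight being off versus groundedness missing from the region's availability list. A condensed sketch of just that branch, with a stubbed availability list standing in for the real checkannotation response:

def report_groundedness_status(available_service: list, flight: bool) -> bool:
    groundedness_service = "groundedness" in available_service and flight
    # the two reasons are now reported independently instead of one shared else-branch
    if not flight:
        print("GroundednessServiceFlight is off.")
    if "groundedness" not in available_service:
        print("AACS service is not available in this region.")
    return groundedness_service


# hypothetical region that supports content harm but not groundedness
print(report_groundedness_status(["content harm"], flight=True))  # prints the AACS message, returns False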
@@ -7,9 +7,9 @@ def normalize_user_text(user_text):
 
 
 @tool
-def construct_request(question: str,
-                      answer: str,
-                      context: str) -> dict:
+def construct_request(answer: str,
+                      context: str,
+                      question: str = "") -> dict:
     metrics = ["generic_groundedness"]
     user_text = json.dumps({"question": question,
                             "answer": answer,
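Making question optional (defaulting to an empty string) lets the groundedness request be built from answer and context alone, which is what the fallback-groundedness path exercises. A rough sketch of the reordered signature; the exact request payload is not visible in this diff, so the return value here is only illustrative:

import json


def construct_request(answer: str, context: str, question: str = "") -> dict:
    # question is now optional and defaults to ""
    metrics = ["generic_groundedness"]
    user_text = json.dumps({"question": question,
                            "answer": answer,
                            "context": context})
    return {"metrics": metrics, "user_text": user_text}  # illustrative shape only


# fallback-style call with no question available
print(construct_request(answer="All are reported as being wanted.",
                        context="Some are reported as not having been wanted at all."))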
@@ -295,10 +295,7 @@ nodes:
     type: code
     path: validate_groundedness_service.py
   inputs:
-    answer: ${inputs.answer}
-    context: ${inputs.context}
     flight: ${inputs.groundedness_service_flight}
-    question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
     validate_input_result: ${validate_input.output}
   use_variants: false
@@ -1,7 +1,7 @@
 from promptflow import tool
 import mlflow
 from mlflow.utils.rest_utils import http_request
-from utils import get_cred, is_valid_string
+from utils import get_cred
 
 
 def is_service_available(flight: bool):
@@ -18,19 +18,23 @@ def is_service_available(flight: bool):
 
         if response.status_code != 200:
             print("Fail to get RAI service availability in this region.")
-            print(response.status_code)
+            print("Response_code: %d" % response.status_code)
         else:
             available_service = response.json()
+            # check if content harm service is avilable
             if "content harm" in available_service:
                 content_harm_service = True
             else:
                 print("Content harm service is not available in this region.")
+            # check if groundedness service is avilable
             if "groundedness" in available_service and flight:
                 groundedness_service = True
-            else:
+            if not flight:
+                print("GroundednessServiceFlight is off.")
+            if "groundedness" not in available_service:
                 print("AACS service is not available in this region.")
     except Exception:
-        print("Fail to get RAI service availability in this region.")
+        print("Failed to call checkannotation endpoint.")
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service
             }
@@ -54,44 +58,46 @@ def is_safety_metric_selected(selected_metrics: dict) -> bool:
 
 
 def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
+    if not selected_metrics["quality_metrics"]["gpt_groundedness"]:
+        print("gpt_groundedness is not selected.")
     return selected_metrics["quality_metrics"]["gpt_groundedness"]
 
 
-def is_input_valid_for_safety_metrics(question: str, answer: str):
-    if is_valid_string(question) and is_valid_string(answer):
-        return True
-    else:
-        print("Input is not valid for safety metrics evaluation")
-        return False
-
-
-# check if RAI service is available in this region. If not, return False.
+# check if RAI service is avilable in this region. If not, return False.
 # check if tracking_uri is set. If not, return False
 # if tracking_rui is set, check if any safety metric is selected.
 # if no safety metric is selected, return False
 @tool
 def validate_safety_metric_input(
         selected_metrics: dict,
         validate_input_result: dict,
-        question: str,
-        answer: str,
         flight: bool = True,
-        context: str = None) -> dict:
-    service_available = is_service_available(flight)
+        ) -> dict:
     tracking_uri_set = is_tracking_uri_set()
+    service_available = is_service_available(flight)
+    safety_metrics_selected = is_safety_metric_selected(selected_metrics)
+    gpt_groundedness_selected = is_groundedness_metric_selected(
+        selected_metrics)
 
-    content_harm_service = is_safety_metric_selected(selected_metrics) \
+    content_harm_service = safety_metrics_selected \
         and service_available["content_harm_service"] and tracking_uri_set \
         and validate_input_result["safety_metrics"]
 
-    groundedness_service = is_groundedness_metric_selected(selected_metrics)\
+    groundedness_service = gpt_groundedness_selected\
         and validate_input_result["gpt_groundedness"] and tracking_uri_set \
         and service_available["groundedness_service"]
 
-    groundedness_prompt = is_groundedness_metric_selected(selected_metrics) \
-        and validate_input_result["gpt_groundedness"] \
+    groundedness_prompt = gpt_groundedness_selected \
+        and validate_input_result["gpt_groundedness"] \
         and (not service_available["groundedness_service"])
 
+    if not validate_input_result["gpt_groundedness"] \
+            and gpt_groundedness_selected:
+        print("Input for gpt_groundedness is not valid")
+
+    if not validate_input_result["safety_metrics"] and safety_metrics_selected:
+        print("Input for safety metrics evaluation is not valid")
+
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service,
             "groundedness_prompt": groundedness_prompt
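The refactored validator computes each check once (tracking URI, service availability, metric selection, input validity) and derives routing flags from them; groundedness falls back from the service to the prompt-based evaluator when the RAI service is unavailable. A simplified sketch of that gating with made-up booleans, not the full tool:

def route_groundedness(gpt_groundedness_selected: bool,
                       input_valid: bool,
                       tracking_uri_set: bool,
                       service_available: bool) -> dict:
    # service-based groundedness needs valid input, a tracking URI and the RAI service
    groundedness_service = (gpt_groundedness_selected and input_valid
                            and tracking_uri_set and service_available)
    # otherwise fall back to the prompt-based groundedness evaluator
    groundedness_prompt = (gpt_groundedness_selected and input_valid
                           and not service_available)
    if gpt_groundedness_selected and not input_valid:
        print("Input for gpt_groundedness is not valid")
    return {"groundedness_service": groundedness_service,
            "groundedness_prompt": groundedness_prompt}


# hypothetical run: metric selected, input valid, but no service in the region
print(route_groundedness(True, True, True, False))
# {'groundedness_service': False, 'groundedness_prompt': True}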
@@ -7,7 +7,7 @@ def is_input_valid_for_safety_metrics(
     if is_valid_string(question) and is_valid_string(answer):
         return True
     else:
-        print("Input is not valid for safety metrics evaluation")
+        print("Input for safety metrics evaluation is not valid")
         return False
 
 
@@ -23,8 +23,7 @@ def validate_input(question: str,
                   "ground_truth": ground_truth}
     expected_input_cols = set(input_data.keys())
     dict_metric_required_fields = {
-        "gpt_groundedness": set(["question",
-                                 "answer",
+        "gpt_groundedness": set(["answer",
                                  "context"]),
         "gpt_relevance": set(["question",
                               "answer",
@@ -49,7 +48,7 @@
         if metric_required_fields <= actual_input_cols:
             data_validation[metric] = True
         else:
-            print("input for %s is not valid" % metric)
+            print("Input for %s is not valid." % metric)
 
     safety_metrics = is_input_valid_for_safety_metrics(question, answer)
     data_validation["safety_metrics"] = safety_metrics
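With the required-fields map updated, gpt_groundedness now validates when only answer and context are present, so a row with an empty question (like the test data below) still passes. A small self-contained check using hypothetical row values and the same subset test as the code above:

# hypothetical row with an empty question, as in the fallback-groundedness test data
input_data = {"question": "", "answer": "All are reported as being wanted.",
              "context": "Some are reported as not having been wanted at all."}

# columns that actually carry non-empty values (the real code uses is_valid_string)
actual_input_cols = {k for k, v in input_data.items() if v and v.strip()}

dict_metric_required_fields = {"gpt_groundedness": {"answer", "context"},  # question no longer required
                               "gpt_relevance": {"question", "answer", "context"}}

for metric, metric_required_fields in dict_metric_required_fields.items():
    if metric_required_fields <= actual_input_cols:
        print("%s: valid" % metric)
    else:
        print("Input for %s is not valid." % metric)
# gpt_groundedness: valid
# Input for gpt_relevance is not valid.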
@@ -74,7 +74,7 @@ def test_evaluate_built_in_qa_fallback_groundedness(self, e2e_openai_api_base, e
                                                         e2e_openai_completion_deployment_name, tmpdir):
         test_data = [
             {"context": "Some are reported as not having been wanted at all.",
-             "question": "are all reported as being wanted?",
+             "question": "",
              "answer": "All are reported as being completely and fully wanted."
              },
             {"question": "How do you log a model?",
@@ -315,6 +315,7 @@ def test_task_type_chat(self, ai_client, e2e_openai_api_base, e2e_openai_api_key
         assert "gpt_groundedness" in columns_in_tabular_data
         assert "gpt_retrieval_score" in columns_in_tabular_data
         assert "evaluation_per_turn" in columns_in_tabular_data
+        assert "messages" in columns_in_tabular_data
 
     def test_task_type_chat_fallback_groundedness(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
         data_path = os.path.join(pathlib.Path(__file__).parent.parent.resolve(), "data")
