update built in flows pupr 0325 #34951

Merged 2 commits on Mar 27, 2024
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/flow.schema.json
-name: template_eval_flow
-display_name: Template Evaluation Flow
+name: chat_quality_safety_eval
+display_name: Chat Quality Safety Evaluation
 type: evaluate
 path: ./flow.dag.yaml
 description: Template Evaluation Flow
@@ -48,7 +48,10 @@ def parse_single_response(response: dict) -> list:
         else:
             metric_value = np.nan
             reasoning = ""
-        parsed_harm_response[harm_type] = float(metric_value)
+        try:
+            parsed_harm_response[harm_type] = float(metric_value)
+        except Exception:
+            parsed_harm_response[harm_type] = np.nan
         parsed_harm_response[harm_type + "_reason"] = reasoning
         parsed_response.append(parsed_harm_response)
     return parsed_response
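Note: the try/except matters because the service can hand back a metric value that float() cannot parse. A minimal sketch of the failure mode, using hypothetical placeholder values rather than the service's actual response schema:

```python
import numpy as np

# Hypothetical metric values: a numeric string parses cleanly, while a
# textual placeholder raises ValueError and None raises TypeError.
for metric_value in ("3", "n/a", None):
    try:
        score = float(metric_value)
    except Exception:
        score = np.nan  # same fallback the new code applies
    print(repr(metric_value), "->", score)
```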
@@ -18,7 +18,7 @@ def is_service_available(flight: bool):

         if response.status_code != 200:
             print("Fail to get RAI service availability in this region.")
-            print(response.status_code)
+            print("Response_code: %d" % response.status_code)
         else:
             available_service = response.json()
             if "content harm" in available_service:
@@ -27,10 +27,12 @@ def is_service_available(flight: bool):
                 print("RAI service is not available in this region.")
             if "groundedness" in available_service and flight:
                 groundedness_service = True
-            else:
+            if not flight:
                 print("GroundednessServiceFlight is off.")
+            if "groundedness" not in available_service:
+                print("AACS service is not available in this region.")
     except Exception:
-        print("Fail to get RAI service availability in this region.")
+        print("Failed to call checkannotation endpoint.")
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service
             }
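Note: the old else branch printed a single message for two distinct causes. The rewritten checks report "flight off" and "service missing" independently, since both can hold at once. A small sketch of that reporting logic in isolation (the report helper and its arguments are illustrative, not part of the flow):

```python
# Illustrative helper mirroring the new, decoupled diagnostics.
def report(available_service: list, flight: bool) -> bool:
    groundedness_service = "groundedness" in available_service and flight
    if not flight:
        print("GroundednessServiceFlight is off.")
    if "groundedness" not in available_service:
        print("AACS service is not available in this region.")
    return groundedness_service

report(["content harm"], flight=True)    # service missing in region
report(["groundedness"], flight=False)   # flight flag is off
report(["content harm"], flight=False)   # both messages print
```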
@@ -53,6 +55,8 @@ def is_safety_metrics_selected(selected_metrics):


 def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
+    if not selected_metrics["rag_metrics"]["gpt_groundedness"]:
+        print("gpt_groundedness is not selected.")
     return selected_metrics["rag_metrics"]["gpt_groundedness"]

@@ -75,8 +79,8 @@ def validate_safety_metric_input(
         chat: [dict],
         validate_chat_result: dict,
         flight: bool = True) -> dict:
-    service_available = is_service_available(flight)
     tracking_uri_set = is_tracking_uri_set()
+    service_available = is_service_available(flight)
     valid_chat = is_chat_valid(chat)
     groundedness_selected = is_groundedness_metric_selected(selected_metrics)
     content_harm_service = is_safety_metrics_selected(selected_metrics) \
@@ -7,9 +7,9 @@ def normalize_user_text(user_text):


 @tool
-def construct_request(question: str,
-                      answer: str,
-                      context: str) -> dict:
+def construct_request(answer: str,
+                      context: str,
+                      question: str = "") -> dict:
     metrics = ["generic_groundedness"]
     user_text = json.dumps({"question": question,
                             "answer": answer,
@@ -295,10 +295,7 @@ nodes:
     type: code
     path: validate_groundedness_service.py
   inputs:
-    answer: ${inputs.answer}
-    context: ${inputs.context}
     flight: ${inputs.groundedness_service_flight}
-    question: ${inputs.question}
     selected_metrics: ${select_metrics.output}
     validate_input_result: ${validate_input.output}
   use_variants: false
@@ -1,7 +1,7 @@
 from promptflow import tool
 import mlflow
 from mlflow.utils.rest_utils import http_request
-from utils import get_cred, is_valid_string
+from utils import get_cred


 def is_service_available(flight: bool):
@@ -18,19 +18,23 @@ def is_service_available(flight: bool):

         if response.status_code != 200:
             print("Fail to get RAI service availability in this region.")
-            print(response.status_code)
+            print("Response_code: %d" % response.status_code)
         else:
             available_service = response.json()
+            # check if content harm service is available
             if "content harm" in available_service:
                 content_harm_service = True
             else:
                 print("Content harm service is not available in this region.")
+            # check if groundedness service is available
             if "groundedness" in available_service and flight:
                 groundedness_service = True
-            else:
+            if not flight:
                 print("GroundednessServiceFlight is off.")
+            if "groundedness" not in available_service:
+                print("AACS service is not available in this region.")
     except Exception:
-        print("Fail to get RAI service availability in this region.")
+        print("Failed to call checkannotation endpoint.")
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service
             }
@@ -54,44 +58,46 @@ def is_safety_metric_selected(selected_metrics: dict) -> bool:


 def is_groundedness_metric_selected(selected_metrics: dict) -> bool:
+    if not selected_metrics["quality_metrics"]["gpt_groundedness"]:
+        print("gpt_groundedness is not selected.")
     return selected_metrics["quality_metrics"]["gpt_groundedness"]


-def is_input_valid_for_safety_metrics(question: str, answer: str):
-    if is_valid_string(question) and is_valid_string(answer):
-        return True
-    else:
-        print("Input is not valid for safety metrics evaluation")
-        return False
-
-
 # check if RAI service is available in this region. If not, return False.
 # check if tracking_uri is set. If not, return False
 # if tracking_uri is set, check if any safety metric is selected.
 # if no safety metric is selected, return False
 @tool
 def validate_safety_metric_input(
         selected_metrics: dict,
         validate_input_result: dict,
-        question: str,
-        answer: str,
         flight: bool = True,
-        context: str = None) -> dict:
-    service_available = is_service_available(flight)
+        ) -> dict:
     tracking_uri_set = is_tracking_uri_set()
+    service_available = is_service_available(flight)
+    safety_metrics_selected = is_safety_metric_selected(selected_metrics)
+    gpt_groundedness_selected = is_groundedness_metric_selected(
+        selected_metrics)

-    content_harm_service = is_safety_metric_selected(selected_metrics) \
+    content_harm_service = safety_metrics_selected \
         and service_available["content_harm_service"] and tracking_uri_set \
         and validate_input_result["safety_metrics"]

-    groundedness_service = is_groundedness_metric_selected(selected_metrics)\
+    groundedness_service = gpt_groundedness_selected\
         and validate_input_result["gpt_groundedness"] and tracking_uri_set \
         and service_available["groundedness_service"]

-    groundedness_prompt = is_groundedness_metric_selected(selected_metrics) \
-        and validate_input_result["gpt_groundedness"] \
+    groundedness_prompt = gpt_groundedness_selected \
+        and validate_input_result["gpt_groundedness"] \
         and (not service_available["groundedness_service"])

+    if not validate_input_result["gpt_groundedness"] \
+            and gpt_groundedness_selected:
+        print("Input for gpt_groundedness is not valid")
+
+    if not validate_input_result["safety_metrics"] and safety_metrics_selected:
+        print("Input for safety metrics evaluation is not valid")
+
     return {"content_harm_service": content_harm_service,
             "groundedness_service": groundedness_service,
             "groundedness_prompt": groundedness_prompt
@@ -7,7 +7,7 @@ def is_input_valid_for_safety_metrics(
     if is_valid_string(question) and is_valid_string(answer):
         return True
     else:
-        print("Input is not valid for safety metrics evaluation")
+        print("Input for safety metrics evaluation is not valid")
         return False

@@ -23,8 +23,7 @@ def validate_input(question: str,
                   "ground_truth": ground_truth}
     expected_input_cols = set(input_data.keys())
     dict_metric_required_fields = {
-        "gpt_groundedness": set(["question",
-                                 "answer",
+        "gpt_groundedness": set(["answer",
                                  "context"]),
         "gpt_relevance": set(["question",
                               "answer",
@@ -49,7 +48,7 @@ def validate_input(question: str,
         if metric_required_fields <= actual_input_cols:
             data_validation[metric] = True
         else:
-            print("input for %s is not valid" % metric)
+            print("Input for %s is not valid." % metric)

     safety_metrics = is_input_valid_for_safety_metrics(question, answer)
     data_validation["safety_metrics"] = safety_metrics
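Note: the required-fields check treats a metric as computable when its field set is a subset (<=) of the provided columns, which is why dropping "question" from gpt_groundedness enables the fallback path. A quick sketch with hypothetical inputs:

```python
# Hypothetical row that has answer and context but no question column.
actual_input_cols = {"answer", "context"}
dict_metric_required_fields = {
    "gpt_groundedness": {"answer", "context"},           # question dropped
    "gpt_relevance": {"question", "answer", "context"},
}

for metric, required in dict_metric_required_fields.items():
    if required <= actual_input_cols:  # subset test
        print("%s is computable" % metric)
    else:
        print("Input for %s is not valid." % metric)
# gpt_groundedness is computable; gpt_relevance is not.
```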
@@ -74,7 +74,7 @@ def test_evaluate_built_in_qa_fallback_groundedness(self, e2e_openai_api_base, e
                                                         e2e_openai_completion_deployment_name, tmpdir):
         test_data = [
             {"context": "Some are reported as not having been wanted at all.",
-             "question": "are all reported as being wanted?",
+             "question": "",
              "answer": "All are reported as being completely and fully wanted."
              },
             {"question": "How do you log a model?",
@@ -315,6 +315,7 @@ def test_task_type_chat(self, ai_client, e2e_openai_api_base, e2e_openai_api_key
         assert "gpt_groundedness" in columns_in_tabular_data
         assert "gpt_retrieval_score" in columns_in_tabular_data
         assert "evaluation_per_turn" in columns_in_tabular_data
+        assert "messages" in columns_in_tabular_data

     def test_task_type_chat_fallback_groundedness(self, ai_client, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
         data_path = os.path.join(pathlib.Path(__file__).parent.parent.resolve(), "data")