Merge pull request #110 from ks6088ts-labs/feature/issue-109_promptflow-evaluation

hands-on evaluators with the prompt flow SDK
ks6088ts authored Sep 1, 2024
2 parents 02c34a1 + 49e6683 commit 43beadb
Showing 9 changed files with 275 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -13,4 +13,4 @@ repos:
    rev: 24.2.0
    hooks:
      - id: black
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json'
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json|.prompty'
9 changes: 9 additions & 0 deletions apps/11_promptflow/README.md
@@ -273,6 +273,15 @@ $ pf run create \
$ pf run show-details --name $RUN_NAME
```

### evaluators

For a guided walkthrough of working with evaluators, see [Evaluate with the prompt flow SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/flow-evaluate-sdk).

```shell
# Show help
python apps/11_promptflow/evaluators/main.py --help
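
# Run a single evaluator, e.g. the prompty-based apology evaluator
python apps/11_promptflow/evaluators/main.py --type apology

# Run the relevance, answer_length, and apology evaluators against data.jsonl
# and write the output to results.json
python apps/11_promptflow/evaluators/main.py --type dataset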
```

<!-- TODO: rag, deployments -->

## References
1 change: 1 addition & 0 deletions apps/11_promptflow/evaluators/.gitignore
@@ -0,0 +1 @@
results.json
25 changes: 25 additions & 0 deletions apps/11_promptflow/evaluators/apology.prompty
@@ -0,0 +1,25 @@
---
name: Apology Evaluator
description: Apology Evaluator for QA scenario
model:
  api: chat
  configuration:
    type: azure_openai
    connection: open_ai_connection
    azure_deployment: gpt-4
  parameters:
    temperature: 0.2
    response_format: { "type": "text" }
inputs:
  question:
    type: string
  answer:
    type: string
outputs:
  apology:
    type: int
---
system:
You are an AI tool that determines if, in a chat conversation, the assistant apologized, for example by saying sorry.
Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
Give an apology of 1 if the assistant apologized in the chat conversation.
2 changes: 2 additions & 0 deletions apps/11_promptflow/evaluators/data.jsonl
@@ -0,0 +1,2 @@
{"answer": "Paris is the capital of France.", "context": "France is in Europe", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks.", "question": "What is the capital of France?"}
{"answer": "I'm sorry, I don't know that. Would you like me to look it up for you?", "context": "Fixing car isn't supported.", "ground_truth": "We don't support car fixing service. I'm sorry, I don't know that. Would you like me to look it up for you?", "question": "Where can I get my car fixed?"}
135 changes: 135 additions & 0 deletions apps/11_promptflow/evaluators/main.py
@@ -0,0 +1,135 @@
import argparse
import logging
from enum import Enum
from os import getenv
from pathlib import Path

from dotenv import load_dotenv
from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator

BASE_DIR = Path(__file__).absolute().parent


class EvaluatorType(Enum):
    RELEVANCE = "relevance"
    ANSWER_LENGTH = "answer_length"
    APOLOGY = "apology"
    DATASET = "dataset"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="run_evaluators",
        description="Evaluate with the prompt flow SDK",
    )
    parser.add_argument(
        "-t",
        "--type",
        default=EvaluatorType.RELEVANCE.value,
        choices=[t.value for t in EvaluatorType],
        help="Evaluator type",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose mode",
    )
    return parser.parse_args()


def run_relevance_evaluator(model_config):
    relevance_eval = RelevanceEvaluator(model_config)

    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )

    print(relevance_score)


# Custom code-based evaluator: a callable class that takes keyword inputs
# and returns a dict of metrics.
class AnswerLengthEvaluator:
    def __init__(self):
        pass

    def __call__(self, *, answer: str, **kwargs):
        return {"answer_length": len(answer)}


def run_answer_length_evaluator():
    evaluator = AnswerLengthEvaluator()
    answer_length = evaluator(answer="What is the speed of light?")
    print(answer_length)


def get_apology_evaluator(model_config):
    # Load the apology evaluator from the prompty file using promptflow
    return load_flow(
        source=f"{BASE_DIR}/apology.prompty",
        model={"configuration": model_config},
    )


def run_apology_evaluator(model_config):
    apology_eval = get_apology_evaluator(model_config)

    apology_score = apology_eval(
        question="Where can I get my car fixed?",
        answer="I'm sorry, I don't know that. Would you like me to look it up for you? Sorry for the inconvenience.",
    )
    print(apology_score)


def run_test_dataset(model_config):
    result = evaluate(
        data=f"{BASE_DIR}/data.jsonl",  # provide your data here
        evaluators={
            EvaluatorType.RELEVANCE.value: RelevanceEvaluator(model_config),
            EvaluatorType.ANSWER_LENGTH.value: AnswerLengthEvaluator(),
            EvaluatorType.APOLOGY.value: get_apology_evaluator(model_config),
        },
        # Map dataset columns to evaluator inputs
        evaluator_config={
            "default": {"ground_truth": "${data.ground_truth}"},
        },
        # Optionally provide an output path to dump a JSON file with the metric summary, row-level data, and studio URL
        output_path=f"{BASE_DIR}/results.json",
    )
    print(result)


if __name__ == "__main__":
    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    load_dotenv()

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=getenv("AZURE_OPENAI_API_KEY"),
        azure_deployment=getenv("AZURE_OPENAI_GPT_MODEL"),
        api_version=getenv("AZURE_OPENAI_API_VERSION"),
    )

    if args.type == EvaluatorType.RELEVANCE.value:
        run_relevance_evaluator(model_config)
    elif args.type == EvaluatorType.ANSWER_LENGTH.value:
        run_answer_length_evaluator()
    elif args.type == EvaluatorType.APOLOGY.value:
        run_apology_evaluator(model_config)
    elif args.type == EvaluatorType.DATASET.value:
        run_test_dataset(model_config)
    else:
        print(f"Invalid evaluator type {args.type}")
        print(f"Please choose from {', '.join([t.value for t in EvaluatorType])}")
        exit(1)
97 changes: 96 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ azure-storage-blob = "^12.22.0"
requests = "^2.32.3"
opencv-python-headless = "^4.10.0.84"
promptflow = "^1.15.0"
promptflow-evals = "^0.3.2"

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.8.0"
9 changes: 5 additions & 4 deletions requirements.txt
@@ -4,18 +4,19 @@ streamlit==1.37.1
azure-cosmos==4.7.0
plotly==5.23.0
pandas==2.2.2
langchain==0.2.12
langchain-openai==0.1.20
langchain-community==0.2.11
langchain==0.2.14
langchain-openai==0.1.22
langchain-community==0.2.12
azure-search-documents==11.5.1
azure-identity==1.17.1
azure-ai-documentintelligence==1.0.0b3
azure-storage-blob==12.22.0
requests==2.32.3
promptflow==1.15.0
promptflow-evals==0.3.2

# To run 99_streamlit_examples/pages/10_Object_Detection.py
# ultralytics==8.2.77
# ultralytics==8.2.82

# To run 99_streamlit_examples/pages/11_Pose_Estimation.py
# mediapipe==0.10.14
