Merge pull request #110 from ks6088ts-labs/feature/issue-109_promptflow-evaluation

hands-on evaluators with the prompt flow SDK
ks6088ts authored Sep 1, 2024
2 parents 02c34a1 + 49e6683 commit 43beadb
Showing 9 changed files with 275 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -13,4 +13,4 @@ repos:
    rev: 24.2.0
    hooks:
      - id: black
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json'
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json|.prompty'
9 changes: 9 additions & 0 deletions apps/11_promptflow/README.md
@@ -273,6 +273,15 @@ $ pf run create \
$ pf run show-details --name $RUN_NAME
```

### evaluators

For a guided walkthrough of working with evaluators, see [Evaluate with the prompt flow SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/flow-evaluate-sdk).

```shell
# Show help
python apps/11_promptflow/evaluators/main.py --help
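
# Run a single evaluator, e.g. the prompty-based apology evaluator
python apps/11_promptflow/evaluators/main.py --type apology

# Run the relevance, answer_length, and apology evaluators against data.jsonl
# and write the output to results.json
python apps/11_promptflow/evaluators/main.py --type dataset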
```

<!-- TODO: rag, deployments -->

## References
1 change: 1 addition & 0 deletions apps/11_promptflow/evaluators/.gitignore
@@ -0,0 +1 @@
results.json
25 changes: 25 additions & 0 deletions apps/11_promptflow/evaluators/apology.prompty
@@ -0,0 +1,25 @@
---
name: Apology Evaluator
description: Apology Evaluator for QA scenario
model:
  api: chat
  configuration:
    type: azure_openai
    connection: open_ai_connection
    azure_deployment: gpt-4
  parameters:
    temperature: 0.2
    response_format: { "type": "text" }
inputs:
  question:
    type: string
  answer:
    type: string
outputs:
  apology:
    type: int
---
system:
You are an AI tool that determines if, in a chat conversation, the assistant apologized, for example by saying sorry.
Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
Give an apology of 1 if the assistant apologized in the chat conversation.
2 changes: 2 additions & 0 deletions apps/11_promptflow/evaluators/data.jsonl
@@ -0,0 +1,2 @@
{"answer": "Paris is the capital of France.", "context": "France is in Europe", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks.", "question": "What is the capital of France?"}
{"answer": "I'm sorry, I don't know that. Would you like me to look it up for you?", "context": "Fixing car isn't supported.", "ground_truth": "We don't support car fixing service. I'm sorry, I don't know that. Would you like me to look it up for you?", "question": "Where can I get my car fixed?"}
135 changes: 135 additions & 0 deletions apps/11_promptflow/evaluators/main.py
@@ -0,0 +1,135 @@
import argparse
import logging
from enum import Enum
from os import getenv
from pathlib import Path

from dotenv import load_dotenv
from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator

BASE_DIR = Path(__file__).absolute().parent


class EvaluatorType(Enum):
    RELEVANCE = "relevance"
    ANSWER_LENGTH = "answer_length"
    APOLOGY = "apology"
    DATASET = "dataset"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="run_evaluators",
        description="Evaluate with the prompt flow SDK",
    )
    parser.add_argument(
        "-t",
        "--type",
        default=EvaluatorType.RELEVANCE.value,
        choices=[t.value for t in EvaluatorType],
        help="Evaluator type",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose mode",
    )
    return parser.parse_args()


def run_relevance_evaluator(model_config):
    relevance_eval = RelevanceEvaluator(model_config)

    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )

    print(relevance_score)


# Custom code-based evaluator: a callable class that takes keyword inputs
# and returns a dict of metrics.
class AnswerLengthEvaluator:
    def __init__(self):
        pass

    def __call__(self, *, answer: str, **kwargs):
        return {"answer_length": len(answer)}


def run_answer_length_evaluator():
    evaluator = AnswerLengthEvaluator()
    answer_length = evaluator(answer="What is the speed of light?")
    print(answer_length)


def get_apology_evaluator(model_config):
    # Load the apology evaluator from the prompty file using promptflow
    return load_flow(
        source=f"{BASE_DIR}/apology.prompty",
        model={"configuration": model_config},
    )


def run_apology_evaluator(model_config):
    apology_eval = get_apology_evaluator(model_config)

    apology_score = apology_eval(
        question="Where can I get my car fixed?",
        answer="I'm sorry, I don't know that. Would you like me to look it up for you? Sorry for the inconvenience.",
    )
    print(apology_score)


def run_test_dataset(model_config):
    result = evaluate(
        data=f"{BASE_DIR}/data.jsonl",  # provide your data here
        evaluators={
            EvaluatorType.RELEVANCE.value: RelevanceEvaluator(model_config),
            EvaluatorType.ANSWER_LENGTH.value: AnswerLengthEvaluator(),
            EvaluatorType.APOLOGY.value: get_apology_evaluator(model_config),
        },
        # Map dataset columns to evaluator inputs
        evaluator_config={
            "default": {"ground_truth": "${data.ground_truth}"},
        },
        # Optionally provide an output path to dump a JSON file with the metric summary, row-level data, and studio URL
        output_path=f"{BASE_DIR}/results.json",
    )
    print(result)


if __name__ == "__main__":
    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    load_dotenv()

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=getenv("AZURE_OPENAI_API_KEY"),
        azure_deployment=getenv("AZURE_OPENAI_GPT_MODEL"),
        api_version=getenv("AZURE_OPENAI_API_VERSION"),
    )

    if args.type == EvaluatorType.RELEVANCE.value:
        run_relevance_evaluator(model_config)
    elif args.type == EvaluatorType.ANSWER_LENGTH.value:
        run_answer_length_evaluator()
    elif args.type == EvaluatorType.APOLOGY.value:
        run_apology_evaluator(model_config)
    elif args.type == EvaluatorType.DATASET.value:
        run_test_dataset(model_config)
    else:
        print(f"Invalid evaluator type {args.type}")
        print(f"Please choose from {', '.join([t.value for t in EvaluatorType])}")
        exit(1)
97 changes: 96 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ azure-storage-blob = "^12.22.0"
requests = "^2.32.3"
opencv-python-headless = "^4.10.0.84"
promptflow = "^1.15.0"
promptflow-evals = "^0.3.2"

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.8.0"
9 changes: 5 additions & 4 deletions requirements.txt
@@ -4,18 +4,19 @@ streamlit==1.37.1
azure-cosmos==4.7.0
plotly==5.23.0
pandas==2.2.2
langchain==0.2.12
langchain-openai==0.1.20
langchain-community==0.2.11
langchain==0.2.14
langchain-openai==0.1.22
langchain-community==0.2.12
azure-search-documents==11.5.1
azure-identity==1.17.1
azure-ai-documentintelligence==1.0.0b3
azure-storage-blob==12.22.0
requests==2.32.3
promptflow==1.15.0
promptflow-evals==0.3.2

# To run 99_streamlit_examples/pages/10_Object_Detection.py
# ultralytics==8.2.77
# ultralytics==8.2.82

# To run 99_streamlit_examples/pages/11_Pose_Estimation.py
# mediapipe==0.10.14
