
Commit

add retrieval context to QA evals and new metrics
jalling97 committed Oct 2, 2024
1 parent e62cb74 commit c057be2
Showing 2 changed files with 38 additions and 4 deletions.
12 changes: 10 additions & 2 deletions src/leapfrogai_evals/evals/qa_eval.py
@@ -2,7 +2,11 @@
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import (
AnswerRelevancyMetric,
ContextualRelevancyMetric,
FaithfulnessMetric,
)
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
actual_output=row["actual_output"],
context=row["context"],
expected_output=row["expected_output"],
retrieval_context=row["retrieval_context"],
additional_metadata={
"actual_annotations": row["actual_annotations"],
"expected_annotations": row["expected_annotations"],
},
# retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
)
)

@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
# TODO: Give ability to choose which metrics to run
correctness_metric = CorrectnessMetric(model=judge_model)
answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
faithfulness_metric = FaithfulnessMetric(model=judge_model)
annotation_relevancy_metric = AnnotationRelevancyMetric()
metrics = [
correctness_metric,
answer_relevancy_metric,
contextual_relevancy_metric,
faithfulness_metric,
annotation_relevancy_metric,
]

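
A minimal sketch of how the two metrics added above consume the new `retrieval_context` field on a deepeval `LLMTestCase`. The judge model string and the sample question/answer/context strings are placeholders for illustration, not values from this repository:

```python
from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

judge_model = "gpt-4o"  # placeholder; qa_eval.py passes its configured judge_model

test_case = LLMTestCase(
    input="What temperature does the QA runner use?",  # placeholder question
    actual_output="The runner was configured with temperature 0.1.",  # placeholder answer
    expected_output="A low temperature is used.",  # placeholder reference answer
    # Chunks retrieved from the vector store; both metrics below read this field.
    retrieval_context=["TEMPERATURE controls sampling; the runner reads it from the environment."],
)

for metric in (
    ContextualRelevancyMetric(model=judge_model),
    FaithfulnessMetric(model=judge_model),
):
    metric.measure(test_case)  # LLM-as-judge scoring of this single test case
    print(type(metric).__name__, metric.score, metric.reason)
```

These metrics need a populated `retrieval_context` to score anything, which is why the runner change below records the retrieved chunks for every question.
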
30 changes: 28 additions & 2 deletions src/leapfrogai_evals/runners/qa_runner.py
@@ -1,6 +1,8 @@
import ast
import logging
import os
import openai
import requests
import shutil
import zipfile

@@ -52,6 +54,7 @@ def __init__(
self.vector_store = None
self.file_dict = None
self.current_assistant = None
self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
self.dataset_name = os.environ.get("QA_DATASET", dataset)
self.model = os.environ.get("MODEL_TO_EVALUATE", model)
self.temperature = float(os.environ.get("TEMPERATURE", temperature))
@@ -73,7 +76,7 @@ def __init__(

self.client = openai.OpenAI(
base_url=base_url or os.getenv("LEAPFROGAI_API_URL"),
api_key=api_key or os.getenv("LEAPFROGAI_API_KEY"),
api_key=self.api_key,
)
logging.info(f"client url: {self.client.base_url}")
try: # use existing vector store if supplied
@@ -101,6 +104,7 @@ def run_experiment(self) -> None:

try:
response_contents = []
retrieved_contexts = []
expected_annotations = []
actual_annotations = []

@@ -132,21 +136,40 @@ def run_experiment(self) -> None:
response_messages.append(message)

response_content = ""
retrieved_context = []
response_annotations = []
for response in response_messages:
response_content += response.content[0].text.value + "\n"
chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

# retrieve context used to generate response
for chunk_id in chunk_ids:
vector_response = requests.get(
url=os.getenv("LEAPFROGAI_API_LFAI_URL")
+ "/vector_stores/vector/"
+ chunk_id,
headers={
"accept": "application/json",
"Authorization": "Bearer " + self.api_key,
},
).json()
retrieved_context.append(vector_response["content"])

for annotation in response.content[0].text.annotations:
annotation_id = annotation.file_citation.file_id
response_annotations.append(annotation_id)

logging.debug(
logging.info(
f"number of annotations in response: {len(response.content[0].text.annotations)}"
)

expected_annotations.append([self.file_dict[row["source_file"]]])
actual_annotations.append(response_annotations)

logging.info(
f"Retrieved context recorded: {vector_response['content']}"
)
retrieved_contexts.append(retrieved_context)
logging.info(f"Response recorded:\n{response_content}")
response_contents.append(response_content)

@@ -159,6 +182,9 @@ def run_experiment(self) -> None:
self.qa_data = self.qa_data.add_column(
name="actual_output", column=response_contents
)
self.qa_data = self.qa_data.add_column(
name="retrieval_context", column=retrieved_contexts
)
self.qa_data = self.qa_data.add_column(
name="expected_annotations", column=expected_annotations
)
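
As a rough sketch, the per-chunk lookup added in `run_experiment` can be read as the helper below. The `LEAPFROGAI_API_LFAI_URL` variable, the `/vector_stores/vector/` path, and the `content` field come straight from the diff; the helper name, the timeout, and the error handling are illustrative additions:

```python
import os

import requests


def fetch_chunk_content(chunk_id: str, api_key: str) -> str:
    """Illustrative helper: fetch the text of one retrieved chunk by its vector ID."""
    base_url = os.getenv("LEAPFROGAI_API_LFAI_URL")  # same env var the runner uses
    response = requests.get(
        url=base_url + "/vector_stores/vector/" + chunk_id,
        headers={
            "accept": "application/json",
            "Authorization": "Bearer " + api_key,
        },
        timeout=30,  # assumed timeout; the commit's call does not set one
    )
    response.raise_for_status()  # assumed error handling; the commit calls .json() directly
    return response.json()["content"]


# Mirrors the loop in run_experiment():
# retrieved_context = [fetch_chunk_content(chunk_id, self.api_key) for chunk_id in chunk_ids]
```

Each question's `retrieved_context` list is appended to `retrieved_contexts` and attached to the dataset as the `retrieval_context` column that `qa_eval.py` now passes into each `LLMTestCase`.
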
