diff --git a/src/leapfrogai_evals/evals/qa_eval.py b/src/leapfrogai_evals/evals/qa_eval.py
index 88cb60926..941217a85 100644
--- a/src/leapfrogai_evals/evals/qa_eval.py
+++ b/src/leapfrogai_evals/evals/qa_eval.py
@@ -2,7 +2,11 @@
 import numpy as np
 import os
 
-from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.metrics import (
+    AnswerRelevancyMetric,
+    ContextualRelevancyMetric,
+    FaithfulnessMetric,
+)
 from deepeval.test_case import LLMTestCase
 
 from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
                 actual_output=row["actual_output"],
                 context=row["context"],
                 expected_output=row["expected_output"],
+                retrieval_context=row["retrieval_context"],
                 additional_metadata={
                     "actual_annotations": row["actual_annotations"],
                     "expected_annotations": row["expected_annotations"],
                 },
-                # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
             )
         )
 
@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     correctness_metric = CorrectnessMetric(model=judge_model)
     answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
+    contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
+    faithfulness_metric = FaithfulnessMetric(model=judge_model)
     annotation_relevancy_metric = AnnotationRelevancyMetric()
     metrics = [
         correctness_metric,
         answer_relevancy_metric,
+        contextual_relevancy_metric,
+        faithfulness_metric,
         annotation_relevancy_metric,
     ]
 
diff --git a/src/leapfrogai_evals/runners/qa_runner.py b/src/leapfrogai_evals/runners/qa_runner.py
index 4875e2ff8..f06dbb303 100644
--- a/src/leapfrogai_evals/runners/qa_runner.py
+++ b/src/leapfrogai_evals/runners/qa_runner.py
@@ -1,6 +1,8 @@
+import ast
 import logging
 import os
 import openai
+import requests
 import shutil
 import zipfile
 
@@ -52,6 +54,7 @@ def __init__(
         self.vector_store = None
         self.file_dict = None
         self.current_assistant = None
+        self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
         self.dataset_name = os.environ.get("QA_DATASET", dataset)
         self.model = os.environ.get("MODEL_TO_EVALUATE", model)
         self.temperature = float(os.environ.get("TEMPERATURE", temperature))
@@ -73,7 +76,7 @@ def __init__(
 
         self.client = openai.OpenAI(
             base_url=base_url or os.getenv("LEAPFROGAI_API_URL"),
-            api_key=api_key or os.getenv("LEAPFROGAI_API_KEY"),
+            api_key=self.api_key,
         )
         logging.info(f"client url: {self.client.base_url}")
         try:  # use existing vector store if supplied
@@ -101,6 +104,7 @@ def run_experiment(self) -> None:
 
         try:
             response_contents = []
+            retrieved_contexts = []
             expected_annotations = []
             actual_annotations = []
 
@@ -132,21 +136,40 @@ def run_experiment(self) -> None:
                         response_messages.append(message)
 
                 response_content = ""
+                retrieved_context = []
                 response_annotations = []
                 for response in response_messages:
                     response_content += response.content[0].text.value + "\n"
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])
+
+                    # retrieve context used to generate response
+                    for chunk_id in chunk_ids:
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_LFAI_URL")
+                            + "/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer " + self.api_key,
+                            },
+                        ).json()
+                        retrieved_context.append(vector_response["content"])
 
                     for annotation in response.content[0].text.annotations:
                         annotation_id = annotation.file_citation.file_id
                         response_annotations.append(annotation_id)
 
-                    logging.debug(
+                    logging.info(
                         f"number of annotations in response: {len(response.content[0].text.annotations)}"
                     )
 
                 expected_annotations.append([self.file_dict[row["source_file"]]])
                 actual_annotations.append(response_annotations)
 
+                logging.info(
+                    f"Retrieved context recorded: {vector_response['content']}"
+                )
+                retrieved_contexts.append(retrieved_context)
                 logging.info(f"Response recorded:\n{response_content}")
                 response_contents.append(response_content)
 
@@ -159,6 +182,9 @@ def run_experiment(self) -> None:
             self.qa_data = self.qa_data.add_column(
                 name="actual_output", column=response_contents
             )
+            self.qa_data = self.qa_data.add_column(
+                name="retrieval_context", column=retrieved_contexts
+            )
             self.qa_data = self.qa_data.add_column(
                 name="expected_annotations", column=expected_annotations
             )
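
Not part of the diff above: a minimal sketch of how the new "retrieval_context" column feeds the two deepeval metrics added in qa_eval.py. The question, answer, context strings, and the "gpt-4o" judge model are placeholder assumptions; the eval itself builds test cases from qa_data and passes its own judge_model.

from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Build a single test case the same way qa_eval.py does, including the
# retrieval_context chunks that qa_runner.py now fetches per response.
test_case = LLMTestCase(
    input="What color is the sky?",  # placeholder question
    actual_output="The sky is blue due to Rayleigh scattering.",
    expected_output="The sky is blue.",
    retrieval_context=[
        "Rayleigh scattering of sunlight makes the daytime sky appear blue."
    ],
)

# ContextualRelevancyMetric scores how relevant the retrieved chunks are to the
# input; FaithfulnessMetric scores whether the answer is grounded in those chunks.
for metric in (
    ContextualRelevancyMetric(model="gpt-4o"),  # placeholder judge model
    FaithfulnessMetric(model="gpt-4o"),
):
    metric.measure(test_case)
    print(type(metric).__name__, metric.score, metric.reason)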