From 08722bf93a70ec6353d736ee6428c9389debbfce Mon Sep 17 00:00:00 2001 From: Ivan Nardini <88703814+inardini@users.noreply.github.com> Date: Wed, 18 Dec 2024 18:42:51 +0200 Subject: [PATCH] feat: Evaluate agents with GenAI Model Eval (#1555) # Description This PR is about the new Gen AI Evaluation for agent evaluation. --------- Co-authored-by: Holt Skinner Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com> --- .github/actions/spelling/allow.txt | 5 + .../evaluation/evaluating_crewai_agent.ipynb | 1571 +++++++++++++++ .../evaluating_langgraph_agent.ipynb | 1561 +++++++++++++++ ...reasoning_engine_customized_template.ipynb | 1694 +++++++++++++++++ ...t_reasoning_engine_prebuilt_template.ipynb | 1528 +++++++++++++++ ...reasoning_engine_customized_template.ipynb | 1691 ++++++++++++++++ 6 files changed, 8050 insertions(+) create mode 100644 gemini/evaluation/evaluating_crewai_agent.ipynb create mode 100644 gemini/evaluation/evaluating_langgraph_agent.ipynb create mode 100644 gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb create mode 100644 gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb create mode 100644 gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index 001bbc2492..29e1122872 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -591,6 +591,7 @@ bqdf bqml breakroom btn +byod byor carbonara cashify @@ -635,6 +636,7 @@ constexpr corpuses countplot cpet +crewai csa cse ctd @@ -671,6 +673,7 @@ doi dotprompt dpi draig +drilldown drinkware dropdown dropna @@ -678,6 +681,7 @@ dsl dtype dtypes dumfries +dumpd dwmapi ecommerce ekg @@ -1152,6 +1156,7 @@ unigram unrtf upsell urandom +usb usebackq usecases username diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb new file mode 100644 index 0000000000..aa3395af1e --- /dev/null +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -0,0 +1,1571 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using Crew AI\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import warnings\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " for agent in crew.agents:\n", + " try:\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + " except AttributeError as e:\n", + " final_output[\"error\"] = f\"Agent does not have tools_results: {str(e)}\"\n", + " print(f\"Error: {e}\")\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names: {predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build CrewAI agent\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
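Before wiring the function into an evaluation task, it can help to see the output contract in isolation. The sketch below is illustrative only (the `stub_agent` name and its canned values are hypothetical, not part of the notebook); it shows the dictionary shape that `parse_crewai_output_to_dictionary` returns and that the evaluation dataset columns (`response`, `predicted_trajectory`) expect.

```python
# Illustrative sketch only: a stand-in runnable with the same output shape as
# agent_parsed_outcome. The function name and canned values are hypothetical.
def stub_agent(prompt: str) -> dict:
    return {
        "response": "High-performance running shoes designed for comfort, support, and speed.",
        "predicted_trajectory": [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "shoes"},
            }
        ],
    }
```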
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " # Create task based on the input\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " # Create crew with sequential process\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. 
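For a quick intuition on the last two metrics, the snippet below compares bare tool-name lists. This is an illustrative sketch only; the service scores the structured tool-call records in your evaluation dataset, so treat it as a mental model rather than the actual computation, and note that the example values are made up.

```python
# Illustrative only: intuition for trajectory_precision and trajectory_recall
# using bare tool names. The values below are made up for the example.
reference = ["get_product_details", "get_product_price"]
predicted = ["get_product_details", "get_product_details"]  # price lookup was never made

precision = sum(p in reference for p in predicted) / len(predicted)  # 2/2 = 1.0
recall = sum(r in predicted for r in reference) / len(reference)  # 1/2 = 0.5
print(precision, recall)
```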
\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z7-LdM3mLBtk" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tdVhCURXMdLG" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": 
\"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "84HiPDOkPseW" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb new file mode 100644 index 0000000000..7b7aafe841 --- /dev/null +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -0,0 +1,1561 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under 
the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using LangGraph\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", + "# Build agent\n", + "from langchain_core.messages import BaseMessage, HumanMessage\n", + "from langchain_core.tools import tool\n", + "from langchain_google_vertexai import ChatVertexAI\n", + "from langgraph.graph import END, MessageGraph\n", + "from langgraph.prebuilt import ToolNode\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", + " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", + "\n", + " final_output = {\n", + " \"response\": \"No AI response found in the message history.\",\n", + " \"predicted_trajectory\": [],\n", + " }\n", + "\n", + " # Process each message\n", + " function_calls = []\n", + " for message in messages:\n", + " # Check if it's a Tool message which contains the actual response\n", + " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", + "\n", + " # Check if it's an AI message to get tool calls\n", + " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", + " for tool_call in tool_calls:\n", + " if tool_call:\n", + " function_calls.append(\n", + " {\n", + " \"tool_name\": tool_call.get(\"name\"),\n", + " \"tool_input\": tool_call.get(\"args\"),\n", + " }\n", + " )\n", + "\n", + " final_output[\"predicted_trajectory\"] = function_calls\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build LangGraph agent\n", + "\n", + "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
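Vertex AI Gen AI Evaluation consumes whatever this custom function returns, so the important part is the shape of that return value: a dictionary with the final `response` and the `predicted_trajectory` of tool calls. A minimal sketch of that contract (the `toy_runnable` name and its hard-coded values are illustrative only, not part of this notebook's agent):

```python
def toy_runnable(prompt: str) -> dict:
    """Illustrative stand-in showing the output structure the evaluation service expects."""
    # A real runnable would invoke the agent here; the values below are hard-coded
    # purely to show the expected keys and their types.
    return {
        "response": "The price of the shoes is 100.",
        "predicted_trajectory": [
            {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
        ],
    }
```

The `agent_parsed_outcome` function defined next produces exactly this structure by running the LangGraph app and parsing its message history.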
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + "\n", + " model = ChatVertexAI(model=model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + "\n", + " app = builder.compile()\n", + " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", + " return parse_messages_to_output_dictionary(chat_history)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
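To build intuition for how these five metrics relate, consider a hand-computed sketch in plain Python (tool names only, for simplicity). Suppose the reference for "Get details for usb charger" is a single `get_product_details` call, but a hypothetical agent run also fetched the price:

```python
# Hand-computed illustration of the trajectory metrics (tool names only, for simplicity).
reference = ["get_product_details"]
predicted = ["get_product_details", "get_product_price"]  # hypothetical run with one extra call


def is_in_order(ref: list[str], pred: list[str]) -> bool:
    """True if every reference step appears in the prediction, in order (extras allowed)."""
    it = iter(pred)
    return all(step in it for step in ref)


exact_match = int(predicted == reference)                                  # 0: the extra call breaks identity
in_order_match = int(is_in_order(reference, predicted))                    # 1: reference step present, in order
any_order_match = int(all(step in predicted for step in reference))        # 1: order and extras don't matter
precision = sum(step in reference for step in predicted) / len(predicted)  # 0.5: one of two predicted calls was expected
recall = sum(step in predicted for step in reference) / len(reference)     # 1.0: every expected call was made
print(exact_match, in_order_match, any_order_match, precision, recall)
```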
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "erYYZEaaTNjJ" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
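The two response metrics above are model-based and reference-free. If your dataset also carries a ground-truth answer, computation-based metrics can be mixed into the same list; a possible sketch, assuming a `reference` column in the dataset and that the metric names below are available in your SDK version (see the linked documentation):

```python
# Hypothetical variant: mixes model-based and computation-based response metrics.
# Assumes eval_sample_dataset also has a "reference" column with ground-truth answers.
response_metrics_with_reference = ["safety", "coherence", "exact_match", "rouge_l_sum"]
```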
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOP9hW-rTUIU" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
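Since the custom judge returns a binary score, one useful way to inspect these results is to isolate only the rows it scored as not following the trajectory. A small sketch, assuming the metrics table uses the service's usual `<metric name>/score` column convention:

```python
# Filter the metrics table down to rows the custom judge flagged (score == 0).
metrics_df = response_eval_tool_result.metrics_table
flagged = metrics_df[metrics_df["response_follows_trajectory/score"] == 0]
display_dataframe_rows(flagged, num_rows=3)
```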
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DJr8GqQKTpUa" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb new file mode 100644 index 0000000000..94edbc96e9 --- /dev/null +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -0,0 +1,1694 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing 
permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate a CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class CrewAIApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " \"\"\"Set up the application.\"\"\"\n", + " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", + " return\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
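Before querying the agent, it is worth noting how small the contract behind the customized template is: a class that exposes `set_up()` for one-time initialization and `query()` for inference, returning a JSON-serializable payload. A minimal, framework-agnostic sketch (illustrative only, not the CrewAI agent used in this notebook):

```python
class EchoApp:
    """Hypothetical minimal application satisfying the customized-template contract."""

    def __init__(self, project: str, location: str) -> None:
        self.project_id = project
        self.location = location

    def set_up(self) -> None:
        # One-time initialization logic (environment variables, clients, ...).
        pass

    def query(self, input: str) -> dict:
        # Return a JSON-serializable result; here we simply echo the input.
        return {"response": input, "predicted_trajectory": []}
```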
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[reasoningengine]\",\n", + " \"crewai\",\n", + " \"crewai-tools\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
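+    ,
+    "\n",
+    "For intuition, here is a minimal, hypothetical comparison using tool names only (the actual metrics also take `tool_input` into account):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical reference vs. predicted trajectories, reduced to tool names.\n",
+    "reference = [\"get_product_details\", \"get_product_price\"]\n",
+    "predicted = [\"get_product_price\", \"get_product_details\"]  # right tools, reversed order\n",
+    "\n",
+    "exact_match = predicted == reference  # False -> score 0\n",
+    "any_order_match = set(reference).issubset(predicted)  # True -> score 1\n",
+    "precision = sum(p in reference for p in predicted) / len(predicted)  # 1.0\n",
+    "recall = sum(r in predicted for r in reference) / len(reference)  # 1.0\n",
+    "```"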
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
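+    ,
+    "\n",
+    "To drill into low-scoring rows, you can also filter the metrics table on the custom metric's score column (assuming the usual `<metric-name>/score` column naming), for example:\n",
+    "\n",
+    "```python\n",
+    "# Illustrative: show only the rows the autorater scored 0 for the custom metric.\n",
+    "low_scores = response_eval_tool_result.metrics_table\n",
+    "low_scores = low_scores[low_scores[\"response_follows_trajectory/score\"] == 0]\n",
+    "display_dataframe_rows(low_scores, num_rows=3)\n",
+    "```"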
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb new file mode 100644 index 0000000000..bd48074f2a --- /dev/null +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -0,0 +1,1528 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# 
distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating a LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangChain\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
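+    ,
+    "\n",
+    "In short: `get_id` generates a short random suffix for experiment run names, `display_eval_report` prints the summary and row-wise metrics of an evaluation result, `display_dataframe_rows` renders a few rows of a metrics table (optionally with a trajectory drill-down), and `plot_bar_plot` / `display_radar_plot` chart the summary metrics."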
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", + "\n", + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this Colab, but you would wire into your database or third party system for a real agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", + "\n", + "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "local_1p_agent = reasoning_engines.LangchainAgent(\n", + " model=model,\n", + " tools=[get_product_details, get_product_price],\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
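+    ,
+    "\n",
+    "Because the agent was created with `agent_executor_kwargs={\"return_intermediate_steps\": True}`, the dictionary returned by `query()` also carries the tool calls the agent made. After running the next cell, you can optionally inspect them with a sketch like the one below (the exact structure of each step can vary by LangChain version):\n",
+    "\n",
+    "```python\n",
+    "# Illustrative only: peek at the tool calls captured in the agent output.\n",
+    "for step in response.get(\"intermediate_steps\", []):\n",
+    "    print(step)\n",
+    "```"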
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INqf60zPWP6L" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dP5g16W1rzMI" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GPNpD676r6T2" + }, + "outputs": [], + "source": [ + "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_1p_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjZMd82vHRh3" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KSCznbhbHRh3" + }, + "outputs": [], + "source": [ + "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
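+    ,
+    "\n",
+    "While prototyping, you can also get a rough sense of the observability signals above with a thin, hypothetical wrapper around the deployed agent (a sketch only, not part of the evaluation service):\n",
+    "\n",
+    "```python\n",
+    "import time\n",
+    "\n",
+    "\n",
+    "def timed_query(prompt: str) -> dict:\n",
+    "    \"\"\"Hypothetical helper: track per-prompt latency and failures for the remote agent.\"\"\"\n",
+    "    start = time.time()\n",
+    "    try:\n",
+    "        output = remote_1p_agent.query(input=prompt)\n",
+    "    except Exception as exc:  # count failures instead of aborting\n",
+    "        output = {\"output\": f\"Agent failed: {exc}\"}\n",
+    "    output[\"latency_in_seconds\"] = time.time() - start\n",
+    "    return output\n",
+    "```"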
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
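+    ,
+    "\n",
+    "For instance, for the prompt \"Get product details and price for headphones\", the reference trajectory expects `get_product_details` followed by `get_product_price`. The metrics defined in the next section differ mainly in how strictly they treat missing, extra, or re-ordered tool calls: calling the two tools in the opposite order can still satisfy an any-order match while failing the exact and in-order matches."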
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing.\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_1p_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb new file mode 100644 index 0000000000..5462940dd3 --- /dev/null +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -0,0 +1,1691 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# 
distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate a LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
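The router above and the `LangGraphApp` class below rely on several LangChain and LangGraph symbols (`BaseMessage`, `HumanMessage`, `ChatVertexAI`, `MessageGraph`, `ToolNode`, `END`, `langchain_load_dump`) that are not among the imports shown earlier. A minimal import block, assuming current package layouts, would be:

```python
# Assumed import locations for the symbols used by the router and LangGraphApp.
from langchain.load import dump as langchain_load_dump
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_google_vertexai import ChatVertexAI
from langgraph.graph import END, MessageGraph
from langgraph.prebuilt import ToolNode
```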
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class LangGraphApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " model = ChatVertexAI(model=self.model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + " self.app = builder.compile()\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", + " return chat_history\n", + " # return {'output': parse_messages_to_output_dictionary(chat_history)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
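The test cells below, as well as the `agent_parsed_response` function later in this notebook, call `parse_messages_to_output_dictionary`, which is not among the helpers shown above (only `parse_crewai_output_to_dictionary` is). A minimal sketch of such a parser, assuming the `dumpd` message format returned by the `query` method (class name as the last element of `id`, payload under `kwargs`), might look like this:

```python
def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:
    """Convert a dumped LangGraph message history into a response/trajectory dict."""
    output = {"response": "", "predicted_trajectory": []}

    for message in messages:
        kwargs = message.get("kwargs", {})
        class_name = (message.get("id") or [""])[-1]

        # Tool invocations requested by the model form the predicted trajectory.
        for tool_call in kwargs.get("tool_calls") or []:
            output["predicted_trajectory"].append(
                {
                    "tool_name": tool_call.get("name"),
                    "tool_input": tool_call.get("args", {}),
                }
            )

        # Treat the content of the last tool or AI message as the final response.
        if class_name in ("ToolMessage", "AIMessage") and kwargs.get("content"):
            output["response"] = kwargs["content"]

    return output
```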
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"langgraph\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
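If you also want response-level ground truth in the same dataset, add a reference column next to `reference_trajectory`. The `reference` column name follows the Gen AI Evaluation dataset conventions, but treat it as an assumption and confirm against the linked dataset documentation; a minimal sketch:

```python
# One-row example with both a reference trajectory and a reference response.
eval_data_with_reference = {
    "prompt": ["Get price for smartphone"],
    "reference_trajectory": [
        [
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "smartphone"},
            }
        ]
    ],
    "reference": ["The smartphone costs $500."],  # assumed column name for the ideal response
}
```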
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", + "\n", + " return agent_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
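As a toy, hand-computed illustration of how the two ratio metrics differ from the exact and ordered matches (this is not the service's implementation, just the definitions above applied to made-up tool names):

```python
# `get_weather` is a fabricated extra step used only to make the ratios non-trivial.
reference = ["get_product_details", "get_product_price"]
predicted = ["get_product_details", "get_weather"]

precision = sum(step in reference for step in predicted) / len(predicted)  # 1/2 = 0.5
recall = sum(step in predicted for step in reference) / len(reference)     # 1/2 = 0.5
```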
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
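The cell after this note runs that evaluation with the model-based metrics defined above. If your dataset also contains reference responses, computation-based metrics can be mixed into the same list; the metric names below are assumed from the Gen AI Evaluation metric catalog, so double-check them against the linked documentation:

```python
# Only meaningful when the evaluation dataset includes a reference response column.
response_metrics_with_reference = ["safety", "coherence", "rouge_l_sum", "bleu"]
```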
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
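Beyond printing sample rows, it is often useful to drill into the custom metric specifically, for example by filtering the rows the judge scored 0. The `<metric_name>/score` and `<metric_name>/explanation` column names follow the Gen AI Evaluation output convention and are assumed here:

```python
metrics_table = response_eval_tool_result.metrics_table

# Rows where the judge decided the response does not follow the trajectory.
not_following = metrics_table[metrics_table["response_follows_trajectory/score"] < 1]

display_dataframe_rows(
    not_following,
    columns=["prompt", "response", "response_follows_trajectory/explanation"],
    num_rows=3,
)
```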
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}