From 56fbaa16400ece7c525b774d485404d9043abccb Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 08:59:36 +0000 Subject: [PATCH 01/14] evaluate agents notebooks --- .../evaluation/evaluating_crewai_agent.ipynb | 1759 ++++ .../evaluating_langgraph_agent.ipynb | 1753 ++++ ...reasoning_engine_customized_template.ipynb | 1914 ++++ ...t_reasoning_engine_prebuilt_template.ipynb | 1758 ++++ ...reasoning_engine_customized_template.ipynb | 9322 +++++++++++++++++ 5 files changed, 16506 insertions(+) create mode 100644 gemini/evaluation/evaluating_crewai_agent.ipynb create mode 100644 gemini/evaluation/evaluating_langgraph_agent.ipynb create mode 100644 gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb create mode 100644 gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb create mode 100644 gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb new file mode 100644 index 0000000000..0b3cec2596 --- /dev/null +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -0,0 +1,1759 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using Crew AI\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 13355, + "status": "ok", + "timestamp": 1734464541030, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tFy3H3aPgx12", + "outputId": "12b6f569-a1ba-41f2-ef43-4a7102ce0661" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 220, + "status": "ok", + "timestamp": 1734464586580, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "XRvKdaPDTznN", + "outputId": "de2a6a54-dbbc-45ac-d488-3f3b31972325" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 57 + }, + "executionInfo": { + "elapsed": 12971, + "status": "ok", + "timestamp": 1734464619145, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Nqwi-5ufWp_B", + "outputId": "9f529389-522f-4e1e-c41e-17578ef0ac74" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT_ID\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from IPython.display import display, Markdown, HTML\n", + "from typing import Literal\n", + "import warnings\n", + "warnings.filterwarnings('ignore', category=Warning, module='opentelemetry.trace')\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Task, Crew, Process\n", + "from crewai_tools import tool\n", + "from crewai.flow.flow import Flow, listen, start\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\n", + " 'response': str(crew_output),\n", + " 'predicted_trajectory': []\n", + " }\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, 'tools_results'):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " 'tool_name': tool_result.get('tool_name', ''),\n", + " 'tool_input': tool_result.get('tool_args', {})\n", + " }\n", + " final_output['predicted_trajectory'].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output['predicted_trajectory']:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output['predicted_trajectory']:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call['tool_input'].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", + " display(HTML(f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"))\n", + "\n", + "\n", + " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", + " continue\n", + "\n", + "\n", + " for tool_input_key in predicted_trajectory['tool_input']:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory['tool_input']:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", + " else:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", + " display_drilldown(row)\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build CrewAI agent\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get('name')\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + " product_researcher = Agent(\n", + " role='Product Researcher',\n", + " goal='Research product details and prices accurately',\n", + " backstory='Expert at gathering and analyzing product information',\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False\n", + " )\n", + "\n", + " # Create task based on the input\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " # Create crew with sequential process\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 6889, + "status": "ok", + "timestamp": 1734464661380, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "lGb58OJkjUs9", + "outputId": "f63e6f69-06b8-4818-e881-fda0fda5c474" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 207 + }, + "executionInfo": { + "elapsed": 4501, + "status": "ok", + "timestamp": 1734464665879, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "2wCFstt8w4Dx", + "outputId": "0d159ba7-7872-4b6e-d7cb-b505810ab1bb" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ]\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "executionInfo": { + "elapsed": 242, + "status": "ok", + "timestamp": 1734464674834, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "EjsonqWWvIvE", + "outputId": "8e466625-dd68-4d8d-fb30-8ee5969c2d5d" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [\n", + " TrajectorySingleToolUse(tool_name='get_product_price')\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 680 + }, + "executionInfo": { + "elapsed": 22544, + "status": "ok", + "timestamp": 1734464702337, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "SRv43fDcd5by", + "outputId": "cc26aba6-9bab-4a01-9e27-51da533b2e62" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 794 + }, + "executionInfo": { + "elapsed": 230, + "status": "ok", + "timestamp": 1734464841587, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "1Jopzw83k14w", + "outputId": "4b2963d7-5fad-4c83-c78f-88b2df7c407b" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 943 + }, + "executionInfo": { + "elapsed": 43240, + "status": "ok", + "timestamp": 1734465074991, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "vOdS7TJUneHN", + "outputId": "c652c7d5-6723-45d8-8744-9a22cc6f89bf" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=trajectory_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 500, + "status": "ok", + "timestamp": 1734465076492, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "z7-LdM3mLBtk", + "outputId": "6bee7ab1-cf45-4f46-f35e-c1293cf53dfd" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 215, + "status": "ok", + "timestamp": 1734465107714, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sLVRdN5llA0h", + "outputId": "434d4447-c5d1-4e8c-fd8f-57a8c7e74d61" + }, + "outputs": [], + "source": [ + "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\n", + " 'safety', 'coherence'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 558 + }, + "executionInfo": { + "elapsed": 36837, + "status": "ok", + "timestamp": 1734465316039, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wRb2EC_hknSD", + "outputId": "60585365-931e-4cef-98af-b88d2e2ec8c0" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 229, + "status": "ok", + "timestamp": 1734465218165, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZODTRuq2lF75", + "outputId": "cabe4dcf-d02f-4e5d-dcf8-103436751c38" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + " }\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 804 + }, + "executionInfo": { + "elapsed": 38880, + "status": "ok", + "timestamp": 1734465423880, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "_dkb4gSn7Ywv", + "outputId": "2a7ff186-9648-4bb9-fa91-88ce10b2a54a" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 247, + "status": "ok", + "timestamp": 1734465424124, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "GH2YvXgLlLH7", + "outputId": "4d7088d0-389d-4924-ff57-5de8db3dfb6b" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 3, + "status": "ok", + "timestamp": 1734465424974, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tdVhCURXMdLG", + "outputId": "9de4b57c-0fa8-44e0-a649-3ec37c57b6d9" + }, + "outputs": [], + "source": [ + "plot_bar_plot(response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'generated_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", + "\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 804 + }, + "executionInfo": { + "elapsed": 41869, + "status": "ok", + "timestamp": 1734466280031, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wBD-4wpB7q-3", + "outputId": "6928eeb6-902b-44b8-ca89-19ecce03df20" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 1576, + "status": "ok", + "timestamp": 1734466281336, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "pQFzmd2I7q-3", + "outputId": "410f6ee9-57f4-4304-c573-1a86b21e51af" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 232, + "status": "ok", + "timestamp": 1734466361805, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "84HiPDOkPseW", + "outputId": "72d05905-3081-4bb6-a911-49796c9815a6" + }, + "outputs": [], + "source": [ + "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment=True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb new file mode 100644 index 0000000000..095eebd601 --- /dev/null +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -0,0 +1,1753 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using LangGraph\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 22416, + "status": "ok", + "timestamp": 1734466630955, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tFy3H3aPgx12", + "outputId": "14563449-48f5-4fe2-ef6b-07fb5a38df3d" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 223, + "status": "ok", + "timestamp": 1734466639485, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "XRvKdaPDTznN", + "outputId": "73a1ca2d-66c9-4f50-f6c0-a9eb0971ea2a" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 57 + }, + "executionInfo": { + "elapsed": 13286, + "status": "ok", + "timestamp": 1734466891717, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Nqwi-5ufWp_B", + "outputId": "d12e3817-c999-459e-c98f-363fd917c24c" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from IPython.display import display, Markdown, HTML\n", + "from typing import Literal\n", + "\n", + "# Build agent\n", + "from langchain_core.messages import BaseMessage, HumanMessage\n", + "from langchain_core.tools import tool\n", + "from langchain_google_vertexai import ChatVertexAI\n", + "from langgraph.graph import END, MessageGraph\n", + "from langgraph.prebuilt import ToolNode\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", + " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", + "\n", + " final_output = {'response': \"No AI response found in the message history.\",\n", + " 'predicted_trajectory': []}\n", + "\n", + " # Process each message\n", + " function_calls = []\n", + " for message in messages:\n", + " # Check if it's a Tool message which contains the actual response\n", + " if message.get('type') == 'constructor' and 'ToolMessage' in message.get('id', []):\n", + " final_output['response'] = message['kwargs']['content']\n", + "\n", + " # Check if it's an AI message to get tool calls\n", + " elif message.get('type') == 'constructor' and 'AIMessage' in message.get('id', []):\n", + " tool_calls = message['kwargs'].get('tool_calls', [])\n", + " for tool_call in tool_calls:\n", + " if tool_call:\n", + " function_calls.append({\n", + " \"tool_name\": tool_call.get('name'),\n", + " \"tool_input\": tool_call.get('args')\n", + " })\n", + "\n", + " final_output['predicted_trajectory'] = function_calls\n", + " return final_output\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output['predicted_trajectory']:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output['predicted_trajectory']:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call['tool_input'].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " 
display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", + " display(HTML(f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"))\n", + "\n", + "\n", + " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", + " continue\n", + "\n", + "\n", + " for tool_input_key in predicted_trajectory['tool_input']:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory['tool_input']:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", + " else:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", + " display_drilldown(row)\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build LangGraph agent\n", + "\n", + "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(state: list[BaseMessage]) -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get('name')\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
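+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Note (added for illustration, not part of the original agent code): the evaluation service expects such a custom function to return a dictionary with a `response` string and a `predicted_trajectory` list of tool calls. The cell below uses made-up sample values only to show that shape." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative only: the output shape the custom runnable must return.\n", + "# The values are hypothetical sample data, not real agent output.\n", + "example_agent_output = {\n", + " 'response': 'The price of shoes is 100.',\n", + " 'predicted_trajectory': [\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + "}\n", + "print(example_agent_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell implements this function for the LangGraph agent."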
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + "\n", + " model = ChatVertexAI(model=model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + "\n", + " app = builder.compile()\n", + " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", + " return parse_messages_to_output_dictionary(chat_history)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 1956, + "status": "ok", + "timestamp": 1734466947459, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "lGb58OJkjUs9", + "outputId": "3142f55a-324f-4a6b-b7fa-d7aee65efff4" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 1543, + "status": "ok", + "timestamp": 1734466950133, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "2wCFstt8w4Dx", + "outputId": "2ddf9be5-4a48-4c17-dd89-b2b4fc1698de" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ]\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." 
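+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Optional (added for illustration): before using the display helpers, you can sanity-check a single row of `eval_sample_dataset` directly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect one row of the evaluation dataset defined above.\n", + "first_row = eval_sample_dataset.iloc[0]\n", + "print('Prompt:', first_row['prompt'])\n", + "for step in first_row['reference_trajectory']:\n", + " print('Expected tool:', step['tool_name'], 'with input', step['tool_input'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell renders a few rows with the display helper."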
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "executionInfo": { + "elapsed": 249, + "status": "ok", + "timestamp": 1734467121881, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "EjsonqWWvIvE", + "outputId": "a3bcc38c-0888-4f0b-8863-b1fbb7b43e53" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [\n", + " TrajectorySingleToolUse(tool_name='get_product_price')\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 647 + }, + "executionInfo": { + "elapsed": 14596, + "status": "ok", + "timestamp": 1734467151419, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "SRv43fDcd5by", + "outputId": "3a67f9ec-ad45-46f1-febc-909ccf6f9512" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." 
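+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Before inspecting the results table, here is a minimal sketch (added for illustration) of the intuition behind `trajectory_single_tool_use`: it only checks whether the expected tool appears anywhere in the predicted trajectory, regardless of order. This is not the service implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Conceptual sketch only (not the Gen AI Evaluation implementation).\n", + "def single_tool_use_score(predicted_trajectory: list[dict], tool_name: str) -> int:\n", + " \"\"\"Return 1 if the expected tool is called anywhere in the trajectory, else 0.\"\"\"\n", + " return int(any(call['tool_name'] == tool_name for call in predicted_trajectory))\n", + "\n", + "sample_trajectory = [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}]\n", + "print(single_tool_use_score(sample_trajectory, 'get_product_price')) # expected: 1\n", + "print(single_tool_use_score(sample_trajectory, 'get_product_details')) # expected: 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell displays a sample of the actual metric results."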
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 794 + }, + "executionInfo": { + "elapsed": 354, + "status": "ok", + "timestamp": 1734467151750, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "1Jopzw83k14w", + "outputId": "d0431283-5663-479d-c71f-8a9e2be005a7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 33448, + "status": "ok", + "timestamp": 1734467192781, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "vOdS7TJUneHN", + "outputId": "b089778d-5d04-4b66-aa55-606e3b234ddf" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=trajectory_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." 
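+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Before inspecting the results, a simplified sketch (added for illustration) of how `trajectory_precision` and `trajectory_recall` relate predicted and reference tool calls. It matches on tool names only; the actual computation is handled by the evaluation service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Simplified sketch only (tool-name matching; not the service implementation).\n", + "def trajectory_precision_recall(predicted: list[dict], reference: list[dict]) -> tuple[float, float]:\n", + " \"\"\"Toy precision/recall over tool names, for intuition only.\"\"\"\n", + " predicted_names = [call['tool_name'] for call in predicted]\n", + " reference_names = [call['tool_name'] for call in reference]\n", + " if not predicted_names or not reference_names:\n", + " return 0.0, 0.0\n", + " precision = sum(name in reference_names for name in predicted_names) / len(predicted_names)\n", + " recall = sum(name in predicted_names for name in reference_names) / len(reference_names)\n", + " return precision, recall\n", + "\n", + "predicted = [{'tool_name': 'get_product_details'}, {'tool_name': 'get_product_price'}]\n", + "reference = [{'tool_name': 'get_product_price'}]\n", + "print(trajectory_precision_recall(predicted, reference)) # expected: (0.5, 1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cells display a sample of the row-wise results and plot the summary metrics."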
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 245, + "status": "ok", + "timestamp": 1734467193023, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sLVRdN5llA0h", + "outputId": "61897eaf-ebb3-4f8e-8c4c-c589aeef795c" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 483, + "status": "ok", + "timestamp": 1734467194033, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "erYYZEaaTNjJ", + "outputId": "69198916-d617-4116-d02d-42d1e44ad0c1" + }, + "outputs": [], + "source": [ + "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\n", + " 'safety', 'coherence'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 734 + }, + "executionInfo": { + "elapsed": 22438, + "status": "ok", + "timestamp": 1734467221724, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wRb2EC_hknSD", + "outputId": "ae416e8a-12ed-4522-f9c1-b7c966242a9e" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOP9hW-rTUIU" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
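+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Added for illustration: `summary_metrics` is a plain dictionary, so you can also pull out just the mean scores programmatically before looking at the row-wise table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: quick look at the mean scores from the response evaluation.\n", + "response_means = {k: v for k, v in response_eval_result.summary_metrics.items() if k.endswith('/mean')}\n", + "print(response_means)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell displays a sample of the row-wise results."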
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 571, + "status": "ok", + "timestamp": 1734467222292, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZODTRuq2lF75", + "outputId": "0b1f1517-9211-413f-ba7b-ae4742c5337c" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + " }\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 238, + "status": "ok", + "timestamp": 1734467234001, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "5EL7iEDMikNQ", + "outputId": "d9290dd7-bd93-4dd6-b8c9-e14c8a22d6e0" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 31118, + "status": "ok", + "timestamp": 1734467268809, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "_dkb4gSn7Ywv", + "outputId": "546e8a71-4161-40be-a61d-85f4031f07d7" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
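+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Added for illustration: you can reuse the `plot_bar_plot` helper defined earlier to compare the summary scores of this run, including the custom `response_follows_trajectory` metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: bar plot of the mean scores from the response-over-tools run.\n", + "plot_bar_plot(\n", + " response_eval_tool_result,\n", + " title='Response over tools',\n", + " metrics=[\n", + " 'trajectory_exact_match/mean',\n", + " 'trajectory_in_order_match/mean',\n", + " 'safety/mean',\n", + " 'response_follows_trajectory/mean',\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell displays a sample of the row-wise results."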
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 255, + "status": "ok", + "timestamp": 1734467269033, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "GH2YvXgLlLH7", + "outputId": "c1cca2ca-c91f-43af-f816-eb6c7231084f" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'generated_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", + "\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and 
the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 31603, + "status": "ok", + "timestamp": 1734467316380, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wBD-4wpB7q-3", + "outputId": "48539d5f-cfcc-490c-9361-001d9c5655fb" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", + " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 432, + "status": "ok", + "timestamp": 1734467316808, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "pQFzmd2I7q-3", + "outputId": "6291a671-5d4b-47fc-ccbb-7beecb681498" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 3, + "status": "ok", + "timestamp": 1734467316808, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "DJr8GqQKTpUa", + "outputId": "763a002d-2170-4107-8588-9cfa0d462d63" + }, + "outputs": [], + "source": [ + "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 16979, + "status": "ok", + "timestamp": 1734430207912, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Ox2I3UfRlTOd", + "outputId": "c608c294-0311-42f9-aae4-ca40befd159c" + }, + "outputs": [], + "source": [ + "delete_experiment=True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb new file mode 100644 index 0000000000..146b976c63 --- /dev/null +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -0,0 +1,1914 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 86407, + "status": "ok", + "timestamp": 1734509697919, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tFy3H3aPgx12", + "outputId": "23790fd1-31d0-4617-ee06-0338947224e0" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 247, + "status": "ok", + "timestamp": 1734509760437, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "XRvKdaPDTznN", + "outputId": "11a3a8d8-dbbe-4ebb-b17b-bfbbabed07f9" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "executionInfo": { + "elapsed": 9679, + "status": "ok", + "timestamp": 1734509790791, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Nqwi-5ufWp_B", + "outputId": "e959f3e6-eca7-4d49-f757-e006b03e3ef1" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import json\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from IPython.display import display, Markdown, HTML\n", + "from typing import Callable, Sequence, TypedDict, Annotated, Literal\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Task, Crew, Process\n", + "from crewai_tools import tool\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\n", + " 'response': str(crew_output),\n", + " 'predicted_trajectory': []\n", + " }\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, 'tools_results'):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " 'tool_name': tool_result.get('tool_name', ''),\n", + " 'tool_input': tool_result.get('tool_args', {})\n", + " }\n", + " final_output['predicted_trajectory'].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output['predicted_trajectory']:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output['predicted_trajectory']:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call['tool_input'].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + 
"\n", + " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", + " display(HTML(f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"))\n", + "\n", + "\n", + " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", + " continue\n", + "\n", + "\n", + " for tool_input_key in predicted_trajectory['tool_input']:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory['tool_input']:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", + " else:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", + " display_drilldown(row)\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get('name')\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class CrewAIApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " \"\"\"Set up the application.\"\"\"\n", + " os.environ['GOOGLE_CLOUD_PROJECT'] = self.project_id\n", + " return\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " product_researcher = Agent(\n", + " role='Product Researcher',\n", + " goal='Research product details and prices accurately',\n", + " backstory='Expert at gathering and analyzing product information',\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False\n", + " )\n", + "\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 7192, + "status": "ok", + "timestamp": 1734510146357, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "PgkOhPmN3aCZ", + "outputId": "6e2752f9-b237-4f0b-e04a-33fb0e7be373" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 207 + }, + "executionInfo": { + "elapsed": 3279, + "status": "ok", + "timestamp": 1734510149635, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "lGb58OJkjUs9", + "outputId": "ba74d03c-364c-42e9-847e-8d819f19836b" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specific dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages).\n", + "\n", + "See the [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more.\n", + "\n", + "> Deploying the agent to Vertex AI Reasoning Engine takes about 10 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 413622, + "status": "ok", + "timestamp": 1734510599752, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "3HLz_a1We4QE", + "outputId": "70a0a6f2-5891-4e57-ce34-2da339e4978c" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[reasoningengine]\",\n", + " \"crewai\",\n", + " \"crewai-tools\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent.\n",
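+ "\n", + "If you later need to query the deployed agent from a new session, you can re-attach to it by its resource name instead of redeploying. The snippet below is a minimal sketch, assuming the `resource_name` attribute and the `reasoning_engines.ReasoningEngine(resource_name)` constructor of the Vertex AI SDK version you installed:\n", + "\n", + "```python\n", + "# Minimal sketch (assumption): re-attach to the deployed agent from another session.\n", + "agent_resource_name = remote_custom_agent.resource_name\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine(agent_resource_name)\n", + "response = remote_custom_agent.query(input='Get product price for headphones')\n", + "```"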
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 5343, + "status": "ok", + "timestamp": 1734510633271, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sqBtzYyce4QE", + "outputId": "ebef6581-b843-47e4-cfe5-f95e6ba80f4f" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ]\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "executionInfo": { + "elapsed": 331, + "status": "ok", + "timestamp": 1734510641426, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "EjsonqWWvIvE", + "outputId": "5cd251f4-8990-4b92-f8f9-1b3d2b97e626" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [\n", + " TrajectorySingleToolUse(tool_name='get_product_price')\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation through the `agent_parsed_response` function, which queries the remote agent, and assign a unique identifier to this specific evaluation run, storing the evaluation results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 700 + }, + "executionInfo": { + "elapsed": 35014, + "status": "ok", + "timestamp": 1734510766421, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "SRv43fDcd5by", + "outputId": "47bd1ae5-d484-4c36-ccf6-0e79ab83c108" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_response,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of the evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 794 + }, + "executionInfo": { + "elapsed": 233, + "status": "ok", + "timestamp": 1734510771445, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZkpwPReipekr", + "outputId": "bf3e0f46-e2b5-4270-9d2f-c576e1f52519" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order.\n",
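+ "\n", + "To build intuition for what these trajectory comparisons measure, here is a small, self-contained sketch that checks a predicted trajectory against a reference one by tool name. It is illustrative only and is not the evaluation service's implementation:\n", + "\n", + "```python\n", + "# Illustrative only: toy comparison of predicted vs. reference tool calls.\n", + "predicted = ['get_product_details', 'get_product_price']\n", + "reference = ['get_product_price']\n", + "\n", + "exact_match = predicted == reference  # same tools, same order\n", + "precision = sum(t in reference for t in predicted) / len(predicted)\n", + "recall = sum(t in predicted for t in reference) / len(reference)\n", + "print(exact_match, precision, recall)  # False 0.5 1.0\n", + "```"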
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order and extras don't matter)\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 41311, + "status": "ok", + "timestamp": 1734510817099, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "vOdS7TJUneHN", + "outputId": "51ece05f-b9ff-46e0-b99f-f9b2f4be5c31" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=trajectory_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results.\n",
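+ "\n", + "Because `metrics_table` is a pandas DataFrame, you can also inspect it directly, for example to isolate rows where the trajectory did not match exactly. The sketch below assumes the per-row score columns follow a `<metric_name>/score` naming pattern; check `metrics_table.columns` in your run and adjust the names if they differ:\n", + "\n", + "```python\n", + "# Sketch (column names are an assumption): drill into non-matching trajectories.\n", + "scores = trajectory_eval_result.metrics_table\n", + "print(scores.columns.tolist())  # confirm the exact metric column names first\n", + "mismatches = scores[scores['trajectory_exact_match/score'] < 1.0]\n", + "mismatches.to_csv('trajectory_mismatches.csv', index=False)\n", + "```"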
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 243, + "status": "ok", + "timestamp": 1734510817339, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sLVRdN5llA0h", + "outputId": "588eee63-64eb-4f47-9bc8-41978fcef599" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 459, + "status": "ok", + "timestamp": 1734510817797, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "PrxM5sMZYXHP", + "outputId": "35a593ee-c99f-4b74-81b8-05a53d80263c" + }, + "outputs": [], + "source": [ + "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\n", + " 'safety', 'coherence'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 943 + }, + "executionInfo": { + "elapsed": 31734, + "status": "ok", + "timestamp": 1734510849530, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wRb2EC_hknSD", + "outputId": "d5c6f65c-405a-463f-bb38-01112871c66d" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 276, + "status": "ok", + "timestamp": 1734510849797, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "cy0aRydrp9zW", + "outputId": "ed3add5b-03ef-4591-8d2c-eead865c017b" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + " }\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 3, + "status": "ok", + "timestamp": 1734510849797, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "5EL7iEDMikNQ", + "outputId": "500f3026-cb93-44b6-d5fb-d96ab863444a" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 41502, + "status": "ok", + "timestamp": 1734510891298, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "_dkb4gSn7Ywv", + "outputId": "02d10b75-d728-4520-ed10-bde10a1da2dc" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 377, + "status": "ok", + "timestamp": 1734510891610, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZODTRuq2lF75", + "outputId": "68651fed-5017-4ed1-a85e-3e2ec3b49c05" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a CrewAI agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'generated_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", + "\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same 
setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 925 + }, + "executionInfo": { + "elapsed": 40090, + "status": "ok", + "timestamp": 1734510931695, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wBD-4wpB7q-3", + "outputId": "082a5d56-c5ad-4a3b-8f2b-8f3902d2892d" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_response,\n", + " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 517, + "status": "ok", + "timestamp": 1734510932184, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "pQFzmd2I7q-3", + "outputId": "49c8127b-591c-4880-8380-3595dff0b52c" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 10, + "status": "ok", + "timestamp": 1734510932184, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "0FEbvEOkZS8f", + "outputId": "d0a8195f-dca9-4ff9-e48c-e81b172bec1a" + }, + "outputs": [], + "source": [ + "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment=True\n", + "delete_remote_agent=True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb new file mode 100644 index 0000000000..cb2d983ab2 --- /dev/null +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -0,0 +1,1758 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating an LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangChain\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 22383, + "status": "ok", + "timestamp": 1734467377714, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tFy3H3aPgx12", + "outputId": "14fde80a-429e-4894-d521-704678805ab9" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 252, + "status": "ok", + "timestamp": 1734467630727, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "XRvKdaPDTznN", + "outputId": "f193ca30-cb73-400d-84c3-ce1842144ef0" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "executionInfo": { + "elapsed": 14164, + "status": "ok", + "timestamp": 1734467672418, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Nqwi-5ufWp_B", + "outputId": "649d75fc-01a5-45d4-8e36-f538205e8374" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from IPython.display import display, Markdown, HTML\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "# Build agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", + " display(HTML(f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", + "\n", + "\n", + " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", + " continue\n", + "\n", + "\n", + " for tool_input_key in predicted_trajectory['tool_input']:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory['tool_input']:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", + " else:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", + " display_drilldown(row)\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", + "\n", + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", + "\n", + "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "local_1p_agent = reasoning_engines.LangchainAgent(\n", + " model=model,\n", + " tools=[get_product_details, get_product_price],\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 46 + }, + "executionInfo": { + "elapsed": 1940, + "status": "ok", + "timestamp": 1734467973841, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "lGb58OJkjUs9", + "outputId": "2089190f-2428-4dcb-8eb2-5214285344d3" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response['output']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 46 + }, + "executionInfo": { + "elapsed": 1897, + "status": "ok", + "timestamp": 1734468011919, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "INqf60zPWP6L", + "outputId": "71b96760-e68a-47c4-80ad-1536253a0673" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(response['output']))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dP5g16W1rzMI" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 231458, + "status": "ok", + "timestamp": 1734468270309, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "GPNpD676r6T2", + "outputId": "02f04032-a8d6-4f04-936f-22f9b9110875" + }, + "outputs": [], + "source": [ + "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_1p_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjZMd82vHRh3" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 46 + }, + "executionInfo": { + "elapsed": 2553, + "status": "ok", + "timestamp": 1734468272840, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "KSCznbhbHRh3", + "outputId": "9a8cb4bc-a266-456a-f4ea-2c6ccecadad8" + }, + "outputs": [], + "source": [ + "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response['output']))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
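+    ,
+    "\n",
+    "Each row pairs a user `prompt` with the `reference_trajectory` you expect for it. If you also bring your own agent outputs, a row can additionally carry a generated trajectory and a final response, for example (an illustrative sketch only; this tutorial adds those columns later in the bring-your-own-dataset section):\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical single row including the optional bring-your-own-dataset columns.\n",
+    "byod_row = {\n",
+    "    \"prompt\": \"Get price for smartphone\",\n",
+    "    \"reference_trajectory\": [\n",
+    "        {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"smartphone\"}}\n",
+    "    ],\n",
+    "    \"generated_trajectory\": [\n",
+    "        {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"smartphone\"}}\n",
+    "    ],\n",
+    "    \"response\": \"The smartphone costs $500.\",\n",
+    "}\n",
+    "```"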
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ]\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "executionInfo": { + "elapsed": 435, + "status": "ok", + "timestamp": 1734468312746, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "EjsonqWWvIvE", + "outputId": "b989c414-d83a-42ca-f7d9-508e0aaaca9f" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [\n", + " TrajectorySingleToolUse(tool_name='get_product_price')\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 613 + }, + "executionInfo": { + "elapsed": 18114, + "status": "ok", + "timestamp": 1734468369661, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "SRv43fDcd5by", + "outputId": "cb875ccf-1ea9-4768-a606-d0e841648850" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=remote_1p_agent,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 829 + }, + "executionInfo": { + "elapsed": 322, + "status": "ok", + "timestamp": 1734468372535, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "1Jopzw83k14w", + "outputId": "3f181e69-b384-4098-a09c-e072473a2dcc" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
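+    ,
+    "\n",
+    "To build intuition for the two fractional metrics, here is a rough sketch of how they behave on tool names (illustration only; the evaluation service computes the official metrics for you):\n",
+    "\n",
+    "```python\n",
+    "# Illustrative comparison of tool names in a reference vs. predicted trajectory.\n",
+    "reference = [\"get_product_details\", \"get_product_price\"]\n",
+    "predicted = [\"get_product_details\", \"get_product_price\", \"some_extra_tool\"]\n",
+    "\n",
+    "precision = sum(step in reference for step in predicted) / len(predicted)  # 2/3 ~ 0.67\n",
+    "recall = sum(step in predicted for step in reference) / len(reference)  # 2/2 = 1.0\n",
+    "```"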
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 960 + }, + "executionInfo": { + "elapsed": 35910, + "status": "ok", + "timestamp": 1734468421299, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "vOdS7TJUneHN", + "outputId": "eef7902b-6674-4776-dfe5-7117154cde8d" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=trajectory_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 393, + "status": "ok", + "timestamp": 1734468421689, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sLVRdN5llA0h", + "outputId": "65586ef4-5845-477e-8c42-ae3277b60a42" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 652, + "status": "ok", + "timestamp": 1734468540382, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "PrxM5sMZYXHP", + "outputId": "c343985c-2eac-4fbc-ca75-3fb4ddbeca2b" + }, + "outputs": [], + "source": [ + "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\n", + " 'safety', 'coherence'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 717 + }, + "executionInfo": { + "elapsed": 25771, + "status": "ok", + "timestamp": 1734468573962, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wRb2EC_hknSD", + "outputId": "ab6becbd-0dc3-4bf7-9fbb-9a03a2aa204a" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 397, + "status": "ok", + "timestamp": 1734468577642, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZODTRuq2lF75", + "outputId": "b4e39f6a-e14f-48ef-eb8f-1039316abbee" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing.\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). 
Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + " }\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 235, + "status": "ok", + "timestamp": 1734468591957, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "5EL7iEDMikNQ", + "outputId": "dbb1f5f1-e0e3-4d77-a29b-93bb6e6b6c54" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 943 + }, + "executionInfo": { + "elapsed": 33115, + "status": "ok", + "timestamp": 1734468628632, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "_dkb4gSn7Ywv", + "outputId": "2f64adb5-e4a0-4b3b-9841-479e3886ae7f" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 24, + "status": "ok", + "timestamp": 1734468628632, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "GH2YvXgLlLH7", + "outputId": "0a6023db-ecfa-4cc3-d03b-d9f021312a17" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." 
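+    ,
+    "\n",
+    "In this tutorial the extra columns are filled in by hand in the next cell. In practice you would typically collect them by running the agent over your prompts, along these lines (hypothetical parsing - inspect one real response first, because the exact shape of `intermediate_steps` depends on the agent template):\n",
+    "\n",
+    "```python\n",
+    "# Sketch: build bring-your-own-dataset columns by querying the deployed agent.\n",
+    "generated_trajectories, responses = [], []\n",
+    "for prompt in eval_data[\"prompt\"]:\n",
+    "    result = remote_1p_agent.query(input=prompt)\n",
+    "    trajectory = []\n",
+    "    for action, _observation in result.get(\"intermediate_steps\", []):\n",
+    "        # Hypothetical field names - adapt after inspecting a real step.\n",
+    "        trajectory.append(\n",
+    "            {\"tool_name\": action[\"kwargs\"][\"tool\"], \"tool_input\": action[\"kwargs\"][\"tool_input\"]}\n",
+    "        )\n",
+    "    generated_trajectories.append(trajectory)\n",
+    "    responses.append(result[\"output\"])\n",
+    "```"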
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "y9hBgsg324Ej"
+   },
+   "outputs": [],
+   "source": [
+    "byod_eval_data = {\n",
+    "    'prompt': [\n",
+    "        \"Get price for smartphone\",\n",
+    "        \"Get product details and price for headphones\",\n",
+    "        \"Get details for usb charger\",\n",
+    "        \"Get product details and price for shoes\",\n",
+    "        \"Get product details for speaker?\"\n",
+    "    ],\n",
+    "    'reference_trajectory': [\n",
+    "        [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n",
+    "        [\n",
+    "            {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n",
+    "            {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n",
+    "        ],\n",
+    "        [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n",
+    "        [\n",
+    "            {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n",
+    "            {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n",
+    "        ],\n",
+    "        [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n",
+    "    ],\n",
+    "    'generated_trajectory': [\n",
+    "        [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n",
+    "        [\n",
+    "            {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n",
+    "            {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n",
+    "        ],\n",
+    "        [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n",
+    "        [\n",
+    "            {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n",
+    "            {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n",
+    "        ],\n",
+    "        [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n",
+    "    ],\n",
+    "    'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n",
+    "\n",
+    "}\n",
+    "\n",
+    "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oEYmU2eJ7q-1"
+   },
+   "source": [
+    "### Run an evaluation task\n",
+    "\n",
+    "Run a new agent evaluation using your own dataset and the same settings as the previous evaluation."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 960 + }, + "executionInfo": { + "elapsed": 32637, + "status": "ok", + "timestamp": 1734468769433, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wBD-4wpB7q-3", + "outputId": "5ff918de-eadc-40e5-b983-75a668fd1db0" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=remote_1p_agent,\n", + " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 481, + "status": "ok", + "timestamp": 1734468772628, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "pQFzmd2I7q-3", + "outputId": "cfe0eee2-1705-4b22-d48a-dbca033ada51" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 224, + "status": "ok", + "timestamp": 1734468783403, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "0FEbvEOkZS8f", + "outputId": "15914a55-db88-4626-e837-bc371cfb43d8" + }, + "outputs": [], + "source": [ + "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 5561, + "status": "ok", + "timestamp": 1734337779157, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Ox2I3UfRlTOd", + "outputId": "f2693115-5c89-4710-c823-a80546711732" + }, + "outputs": [], + "source": [ + "delete_experiment=True\n", + "delete_remote_agent=True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_1p_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": 
{ + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb new file mode 100644 index 0000000000..cf7bd60348 --- /dev/null +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -0,0 +1,9322 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 57379, + "status": "ok", + "timestamp": 1734507469619, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "tFy3H3aPgx12", + "outputId": "3b6356f6-3831-4366-e029-2cb3b34b4d2f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m192.0/192.0 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.0/468.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.8/131.8 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m628.3/628.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.8/147.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.4/211.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m72.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.0/15.0 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.5/233.5 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.6/278.6 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m249.9/249.9 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.6/131.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m327.6/327.6 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.3/44.3 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.7/50.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.8/311.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.2/83.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.8/54.8 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m481.7/481.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m56.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.1/442.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.0/209.0 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.2/267.2 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m57.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m443.8/443.8 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m49.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docx2txt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for pypika (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[33m WARNING: The script uvicorn is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pytube is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script dotenv is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pypdfium2 is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script nodeenv is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script mako-render is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script json_repair is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script humanfriendly is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script fastavro is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script watchfiles is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The scripts pyright, pyright-langserver, pyright-python and pyright-python-langserver are installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script gptcache_server is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script coloredlogs is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pyproject-build is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this 
directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script alembic is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script onnxruntime_test is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script fastapi is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pdfplumber is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script litellm is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script instructor is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script chroma is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script ec is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script crewai is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", + "transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 240, + "status": "ok", + "timestamp": 1734507927111, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "XRvKdaPDTznN", + "outputId": "bd462f4a-0cfc-429f-d955-e5df75e94773" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "executionInfo": { + "elapsed": 7562, + "status": "ok", + "timestamp": 1734508011536, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "Nqwi-5ufWp_B", + "outputId": "d9d462c6-03cd-4c16-f5e7-55ad485e7dc2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating gs://evaluate_agents/...\n", + "ServiceException: 409 A Cloud Storage bucket named 'evaluate_agents' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import json\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from IPython.display import display, Markdown, HTML\n", + "from typing import Callable, Sequence, TypedDict, Annotated, Literal\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Task, Crew, Process\n", + "from crewai_tools import tool\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\n", + " 'response': str(crew_output),\n", + " 'predicted_trajectory': []\n", + " }\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, 'tools_results'):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " 'tool_name': tool_result.get('tool_name', ''),\n", + " 'tool_input': tool_result.get('tool_args', {})\n", + " }\n", + " final_output['predicted_trajectory'].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output['predicted_trajectory']:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output['predicted_trajectory']:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call['tool_input'].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + 
"\n", + " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", + " display(HTML(f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"))\n", + "\n", + "\n", + " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", + " continue\n", + "\n", + "\n", + " for tool_input_key in predicted_trajectory['tool_input']:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory['tool_input']:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", + " else:\n", + " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", + " display_drilldown(row)\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(state: list[BaseMessage]) -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get('name')\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
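The agent class assembled later relies on LangChain and LangGraph classes (`ChatVertexAI`, `MessageGraph`, `ToolNode`, `HumanMessage`, `BaseMessage`, and the LangChain dump utilities) that are not part of the import cell above. If they are missing in your environment, an import block along these lines is needed; this is a sketch, and module paths may vary slightly across LangChain versions.

```python
# Imports assumed by the LangGraph agent class below (paths may vary by version).
from langchain_core.messages import BaseMessage, HumanMessage
from langchain_core.load import dump as langchain_load_dump
from langchain_google_vertexai import ChatVertexAI
from langgraph.graph import END, MessageGraph
from langgraph.prebuilt import ToolNode
```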
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class LangGraphApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " model = ChatVertexAI(model=self.model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + " self.app = builder.compile()\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", + " return chat_history\n", + " # return {'output': parse_messages_to_output_dictionary(chat_history)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
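The test cells below parse the agent output with `parse_messages_to_output_dictionary`, which is not among the helper functions defined earlier (only `parse_crewai_output_to_dictionary` is). If that helper is not defined elsewhere in your copy of the notebook, a minimal sketch is shown here; the key lookups are assumptions based on how LangChain serializes messages and on the outputs shown below, so adjust them to your LangChain version if needed.

```python
def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:
    """Convert a dumped LangGraph message history into the response/trajectory
    dictionary format used by the evaluation dataset.

    NOTE: illustrative sketch; the exact structure of the dumped messages
    depends on the LangChain version.
    """
    final_output = {
        "response": "Agent response not available.",
        "predicted_trajectory": [],
    }

    for message in messages:
        kwargs = message.get("kwargs", {})

        # Tool calls emitted by the model form the predicted trajectory.
        for tool_call in kwargs.get("tool_calls", []):
            final_output["predicted_trajectory"].append(
                {
                    "tool_name": tool_call.get("name", ""),
                    "tool_input": tool_call.get("args", {}),
                }
            )

        # The content of the last message with text is treated as the response.
        if kwargs.get("content"):
            final_output["response"] = kwargs["content"]

    return final_output
```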
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 1688, + "status": "ok", + "timestamp": 1734506144463, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "PgkOhPmN3aCZ", + "outputId": "dfba408f-6510-4eb8-acd2-e845e360a6ef" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "High-performance running shoes designed for comfort, support, and speed.\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_details`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 977, + "status": "ok", + "timestamp": 1734506145439, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "lGb58OJkjUs9", + "outputId": "ca9ea3b9-7141-43e0-b169-fb30bd028509" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "100\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_price`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." 
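The deployment cell below passes only `requirements`, because the agent class is defined directly in the notebook. If your agent class lived in a local module instead, you would also list that file under `extra_packages`, as the note above mentions. A sketch follows; the file name is illustrative.

```python
# Sketch: deploy an agent whose class is defined in a local file.
remote_custom_agent = reasoning_engines.ReasoningEngine.create(
    local_custom_agent,
    requirements=[
        "google-cloud-aiplatform[langchain,reasoningengine]",
        "langchain_google_vertexai",
        "langgraph",
    ],
    extra_packages=["agent_app.py"],  # hypothetical local module containing LangGraphApp
)
```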
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 239473, + "status": "ok", + "timestamp": 1734472951301, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "3HLz_a1We4QE", + "outputId": "e95d4768-a790-4910-c172-94f90bb0a8bd" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:vertexai.reasoning_engines._reasoning_engines:Using bucket evaluate_agents\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/reasoning_engine.pkl\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/requirements.txt\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Creating in-memory tarfile of extra_packages\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/dependencies.tar.gz\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Creating ReasoningEngine\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Create ReasoningEngine backing LRO: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496/operations/5878089664325222400\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:ReasoningEngine created. Resource name: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:To use this ReasoningEngine in another session:\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:reasoning_engine = vertexai.preview.reasoning_engines.ReasoningEngine('projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496')\n" + ] + } + ], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"langgraph\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." 
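If you come back to this notebook in a new session, you do not need to redeploy the agent. The deployment logs above print the resource name of the Reasoning Engine, and you can re-instantiate the remote agent from it; the resource name below is a placeholder, so substitute your own.

```python
from vertexai.preview import reasoning_engines

# Replace with the resource name printed in the deployment logs above.
REASONING_ENGINE_RESOURCE_NAME = (
    "projects/PROJECT_NUMBER/locations/us-central1/reasoningEngines/ENGINE_ID"
)
remote_custom_agent = reasoning_engines.ReasoningEngine(REASONING_ENGINE_RESOURCE_NAME)
```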
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 190 + }, + "executionInfo": { + "elapsed": 1834, + "status": "ok", + "timestamp": 1734506153310, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sqBtzYyce4QE", + "outputId": "16f3296b-9dd4-404a-8453-00fd799118ff" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "High-performance running shoes designed for comfort, support, and speed.\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_details`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
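For the bring-your-own-dataset scenario mentioned above, you would add `response` and `predicted_trajectory` columns alongside the prompts and reference trajectories, so the evaluation runs on pre-generated outputs instead of calling the agent. A minimal sketch follows; the values are illustrative, and with such a dataset you would call `evaluate()` without passing a `runnable`.

```python
import pandas as pd

# Bring-your-own-dataset sketch: responses and predicted trajectories were
# generated offline, so no agent call is needed at evaluation time.
byod_data = {
    "prompt": ["Get price for smartphone"],
    "reference_trajectory": [
        [{"tool_name": "get_product_price", "tool_input": {"product_name": "smartphone"}}]
    ],
    "predicted_trajectory": [
        [{"tool_name": "get_product_price", "tool_input": {"product_name": "smartphone"}}]
    ],
    "response": ["500"],
}
byod_dataset = pd.DataFrame(byod_data)
```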
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ]\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 273 + }, + "executionInfo": { + "elapsed": 388, + "status": "ok", + "timestamp": 1734506157571, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "EjsonqWWvIvE", + "outputId": "0dbf8eec-171c-4fa2-943f-6e4e936a0d94" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", + "\n", + " return agent_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [\n", + " TrajectorySingleToolUse(tool_name='get_product_price')\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 700 + }, + "executionInfo": { + "elapsed": 15594, + "status": "ok", + "timestamp": 1734506517045, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "SRv43fDcd5by", + "outputId": "65edaf55-79c4-404c-d5ab-a2e75898e9b8" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-agent-single-metric-eval-s58mdw1j to Experiment: evaluate-agent\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.81it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 5 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 5/5 [00:04<00:00, 1.23it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:4.098520709000013 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n 
\"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 2.1747785195999767,\n \"max\": 2.1747785195999767,\n \"num_unique_values\": 1,\n \"samples\": [\n 2.1747785195999767\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5584294262336947,\n \"max\": 0.5584294262336947,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5584294262336947\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_single_tool_use/meantrajectory_single_tool_use/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.00.60.5477232.1747790.5584290.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_single_tool_use/mean trajectory_single_tool_use/std \\\n", + "0 5.0 0.6 0.547723 \n", + "\n", + " latency_in_seconds/mean latency_in_seconds/std failure/mean failure/std \n", + "0 2.174779 0.558429 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.4841679319999912,\n \"max\": 2.7480303170000298,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.7480303170000298,\n 2.3126841799999056,\n 2.624170197000012\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_single_tool_use/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4841680[{'tool_name': 'get_product_price', 'tool_inpu...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...502.748030[{'tool_name': 'get_product_details', 'tool_in...1.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.624170[{'tool_name': 'get_product_details', 'tool_in...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.704840[{'tool_name': 'get_product_details', 'tool_in...1.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...2.3126840[{'tool_name': 'get_product_details', 'tool_in...0.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.484168 \n", + "1 50 2.74803 \n", + "2 A super fast and light usb charger 2.62417 \n", + "3 100 1.70484 \n", + "4 A voice-controlled smart speaker that plays mu... 2.312684 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_single_tool_use/score \n", + "0 1.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_response,\n", + " experiment_run_name=EXPERIMENT_RUN)\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 794 + }, + "executionInfo": { + "elapsed": 242, + "status": "ok", + "timestamp": 1734506577360, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZkpwPReipekr", + "outputId": "6e8961d7-a66e-49e4-fad6-4d637fdf6dd0" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4841679319999912
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.7480303170000298
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.624170197000012
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 279 + }, + "executionInfo": { + "elapsed": 32287, + "status": "ok", + "timestamp": 1734506644979, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "vOdS7TJUneHN", + "outputId": "d5df1ad4-5ff6-4bec-d15f-eeac5921c601" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-trajectory-3b77ede9-8ae8-416b-9fdf-50bab4b99297 to Experiment: evaluate-re-agent-trajectory\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.90it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 25 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 25/25 [00:24<00:00, 1.04it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 25 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:24.113868357 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=trajectory_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 483, + "status": "ok", + "timestamp": 1734506658651, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "sLVRdN5llA0h", + "outputId": "457f3216-2323-4bdf-eda5-7a22b88ca54f" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.57008658299992
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.7254483579999942
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.6286665519999133
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 244, + "status": "ok", + "timestamp": 1734506659132, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "PrxM5sMZYXHP", + "outputId": "40cbb7f1-eb52-4fd7-af16-14897a629f5f" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\n", + " 'safety', 'coherence'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 279 + }, + "executionInfo": { + "elapsed": 20843, + "status": "ok", + "timestamp": 1734506685051, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wRb2EC_hknSD", + "outputId": "6207f313-1040-418a-c506-fcae56b4c170" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-350dc51f-c862-4661-a311-910720d88957 to Experiment: evaluate-re-agent-response\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:01<00:00, 2.63it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 10 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 10/10 [00:13<00:00, 1.36s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 10 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:13.589168556999994 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " 
\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 240, + "status": "ok", + "timestamp": 1734506703538, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "cy0aRydrp9zW", + "outputId": "325bda1f-a89e-4117-85fe-b7d452b4da87" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4945395349998307
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", + "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", + "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", + "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", + "STEP 5: Pronouns and references are absent in the response.\n", + "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.8972680370000035
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.5881808110000293
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", + "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", + "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", + "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", + "STEP 5: There are no pronouns or references to assess.\n", + "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + " }\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "elapsed": 435, + "status": "ok", + "timestamp": 1734506717333, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "5EL7iEDMikNQ", + "outputId": "8a7fc362-3449-426a-a244-5fd380d219af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Instruction\n", + "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. 
We will provide you with the user prompt and an AI-generated responses.\n", + "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", + "You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.\n", + "\n", + "\n", + "# Evaluation\n", + "## Criteria\n", + "Follows trajectory: Evaluate whether the agent's response logically follows from the sequence of actions it took. Consider these sub-points:\n", + " - Does the response reflect the information gathered during the trajectory?\n", + " - Is the response consistent with the goals and constraints of the task?\n", + " - Are there any unexpected or illogical jumps in reasoning?\n", + "Provide specific examples from the trajectory and response to support your evaluation.\n", + "\n", + "## Rating Rubric\n", + "0: Does not follow trajectory\n", + "1: Follows trajectory\n", + "\n", + "## Evaluation Steps\n", + "Step 1: Assess the response in aspects of all criteria provided. Provide assessment according to each criterion.\n", + "Step 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.\n", + "\n", + "\n", + "# User Inputs and AI-generated Response\n", + "## User Inputs\n", + "### predicted_trajectory\n", + "{predicted_trajectory}\n", + "\n", + "### prompt\n", + "{prompt}\n", + "\n", + "\n", + "\n", + "\n", + "## AI-generated Response\n", + "{response}\n" + ] + } + ], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." 
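Before launching the run, it can help to spot-check how the custom metric prompt gets filled for a single row. The sketch below only does plain string substitution on `prompt_data`; the evaluation service performs the real assembly when `EvalTask` runs, and the trajectory and response values here are illustrative stand-ins rather than actual agent output.

```python
# Spot-check the assembled custom-metric prompt on one example row.
sample_row = eval_sample_dataset.iloc[0]

filled_prompt = response_follows_trajectory_prompt_template.prompt_data
substitutions = {
    "prompt": sample_row["prompt"],
    "predicted_trajectory": str(sample_row["reference_trajectory"]),  # stand-in for the agent's logged tool calls
    "response": "500",  # stand-in answer, matching the smartphone price used earlier
}
for placeholder, value in substitutions.items():
    filled_prompt = filled_prompt.replace("{" + placeholder + "}", str(value))

print(filled_prompt)
```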
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 891 + }, + "executionInfo": { + "elapsed": 28503, + "status": "ok", + "timestamp": 1734506756916, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "_dkb4gSn7Ywv", + "outputId": "5f647c02-7e90-433c-d4d6-910ea18b7133" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-911730d1-06a8-4bde-9eeb-8f66d51217f8 to Experiment: evaluate-re-agent-response-by-tools\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:01<00:00, 2.56it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 20/20 [00:21<00:00, 1.08s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:21.68623241199998 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.634030882800016,\n \"max\": 1.634030882800016,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.634030882800016\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.2428521800798761,\n \"max\": 0.2428521800798761,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.2428521800798761\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.60.5477231.6340310.2428520.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", + "0 5.0 1.0 0.0 \n", + "\n", + " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", + "0 1.0 0.0 1.0 \n", + "\n", + " safety/std response_follows_trajectory/mean \\\n", + "0 0.0 0.6 \n", + "\n", + " response_follows_trajectory/std latency_in_seconds/mean \\\n", + "0 0.547723 1.634031 \n", + "\n", + " latency_in_seconds/std failure/mean failure/std \n", + "0 0.242852 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.3765636650000488,\n \"max\": 1.943170352999914,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.8326374470000246,\n 1.3765636650000488,\n 1.5494367260000672\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response, \\\"50\\\", does not contain any unsafe content. 
It is a simple numerical response, likely referring to a price, and doesn't exhibit hate speech, harassment, dangerous instructions, or sexually explicit material.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. The response only provides a number, which is not enough information. It doesn't say 50 what (dollars? euros? units in stock?). Additionally, we don't get any details as requested. Therefore, the response does not reflect the information that should have been gathered during the trajectory and thus doesn't follow it.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4683460[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is a simple price and does not co...1.0The AI's response follows the trajectory becau...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.8326370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response, \"50\", does not contain any unsaf...1.0The response \"50\" does not follow the trajecto...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger1.5494370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It does not contain any ...1.0The response \"A super fast and light usb charg...1.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.943170[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a single number and does not p...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.3765640[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It provides information ...1.0The response provides a high-level description...1.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.468346 \n", + "1 50 1.832637 \n", + "2 A super fast and light usb charger 1.549437 \n", + "3 100 1.94317 \n", + "4 A voice-controlled smart speaker that plays mu... 1.376564 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_exact_match/score trajectory_in_order_match/score \\\n", + "0 1.0 1.0 \n", + "1 1.0 1.0 \n", + "2 1.0 1.0 \n", + "3 1.0 1.0 \n", + "4 1.0 1.0 \n", + "\n", + " safety/explanation safety/score \\\n", + "0 The response is a simple price and does not co... 1.0 \n", + "1 The response, \"50\", does not contain any unsaf... 1.0 \n", + "2 The response is safe. It does not contain any ... 1.0 \n", + "3 The response is a single number and does not p... 1.0 \n", + "4 The response is safe. It provides information ... 1.0 \n", + "\n", + " response_follows_trajectory/explanation \\\n", + "0 The AI's response follows the trajectory becau... \n", + "1 The response \"50\" does not follow the trajecto... \n", + "2 The response \"A super fast and light usb charg... \n", + "3 The response \"100\" does not follow the traject... \n", + "4 The response provides a high-level description... \n", + "\n", + " response_follows_trajectory/score \n", + "0 1.0 \n", + "1 0.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 1.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 239, + "status": "ok", + "timestamp": 1734506757152, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "ZODTRuq2lF75", + "outputId": "7dd44083-885d-4811-89b9-25abc88e95de" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4945395349998307
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", + "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", + "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", + "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", + "STEP 5: Pronouns and references are absent in the response.\n", + "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.8972680370000035
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.5881808110000293
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", + "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", + "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", + "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", + "STEP 5: There are no pronouns or references to assess.\n", + "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " 'prompt': [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\"\n", + " ],\n", + " 'reference_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'generated_trajectory': [\n", + " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", + " [\n", + " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", + " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " ],\n", + " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", + " ],\n", + " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", + "\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 891 + }, + "executionInfo": { + "elapsed": 30095, + "status": "ok", + "timestamp": 1734506845575, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "wBD-4wpB7q-3", + "outputId": "a0db90f4-0a90-4635-ee18-3a8479a71658" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-response-over-tools-byod-crxo2pye to Experiment: evaluate-re-agent-response-by-tools\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.93it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 20/20 [00:22<00:00, 1.12s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:22.457164905000127 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4,\n \"max\": 0.4,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.8266308515999754,\n \"max\": 1.8266308515999754,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.8266308515999754\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4469010777924883,\n \"max\": 0.4469010777924883,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4469010777924883\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.40.5477231.8266310.4469010.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", + "0 5.0 1.0 0.0 \n", + "\n", + " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", + "0 1.0 0.0 1.0 \n", + "\n", + " safety/std response_follows_trajectory/mean \\\n", + "0 0.0 0.4 \n", + "\n", + " response_follows_trajectory/std latency_in_seconds/mean \\\n", + "0 0.547723 1.826631 \n", + "\n", + " latency_in_seconds/std failure/mean failure/std \n", + "0 0.446901 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.41932438799995,\n \"max\": 2.585738198999934,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.7416313100000025,\n 1.41932438799995,\n 2.585738198999934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"The response is a simple \\\"50\\\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. 
There is no hate speech, harassment, dangerous or sexually explicit content.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The AI response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \\\"50\\\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.6097930[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is safe, as it does not contain a...1.0The AI's response follows the trajectory set b...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.7416310[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a simple \"50\" which is not har...1.0The AI response \"50\" does not follow the traje...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.5857380[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The AI's response does not follow the predicte...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.7766670[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a number which doesn't promote...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.4193240[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The response \"A voice-controlled smart speaker...1.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.609793 \n", + "1 50 1.741631 \n", + "2 A super fast and light usb charger 2.585738 \n", + "3 100 1.776667 \n", + "4 A voice-controlled smart speaker that plays mu... 1.419324 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_exact_match/score trajectory_in_order_match/score \\\n", + "0 1.0 1.0 \n", + "1 1.0 1.0 \n", + "2 1.0 1.0 \n", + "3 1.0 1.0 \n", + "4 1.0 1.0 \n", + "\n", + " safety/explanation safety/score \\\n", + "0 The response is safe, as it does not contain a... 1.0 \n", + "1 The response is a simple \"50\" which is not har... 1.0 \n", + "2 The response is safe, as it does not contain a... 1.0 \n", + "3 The response is a number which doesn't promote... 1.0 \n", + "4 The response is safe, as it does not contain a... 1.0 \n", + "\n", + " response_follows_trajectory/explanation \\\n", + "0 The AI's response follows the trajectory set b... \n", + "1 The AI response \"50\" does not follow the traje... \n", + "2 The AI's response does not follow the predicte... \n", + "3 The response \"100\" does not follow the traject... \n", + "4 The response \"A voice-controlled smart speaker... 
\n", + "\n", + " response_follows_trajectory/score \n", + "0 1.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 1.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_response,\n", + " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "executionInfo": { + "elapsed": 253, + "status": "ok", + "timestamp": 1734506845825, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "pQFzmd2I7q-3", + "outputId": "24e51c3b-e104-471d-e15f-e63d925d0fd7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.6097934590000023
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI's response follows the trajectory set by the predicted trajectory. The trajectory indicates that the AI should use the \"get_product_price\" tool with \"smartphone\" as input. Based on the response \"500\", we can infer that the AI successfully executed this action and returned the price. Thus, the response directly reflects the information gathered during the trajectory by using the specified tool and input. The response is consistent with the user's prompt to \"Get price for smartphone\" as it provides a numerical value which can be interpreted as a price. There are no unexpected jumps in reasoning, making the response logical and relevant. Therefore, a rating of \"1\" is assigned, indicating that the AI's response follows the trajectory.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.7416313100000025
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is a simple \"50\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. There is no hate speech, harassment, dangerous or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI response \"50\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \"50\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.585738198999934
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI's response does not follow the predicted trajectory, which suggests using the \"get_product_details\" tool to look up information on a USB charger. Instead of returning product information, the AI offers a generic description: \"A super fast and light usb charger.\" This response doesn't demonstrate use of the tool or retrieval of specific product details.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQUZRGb3rLC0" + }, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "executionInfo": { + "elapsed": 2, + "status": "ok", + "timestamp": 1734507187235, + "user": { + "displayName": "Ivan Nardini", + "userId": "04192340647469915671" + }, + "user_tz": -120 + }, + "id": "0FEbvEOkZS8f", + "outputId": "94a95394-05fe-47f4-ce9c-301ce311bcf5" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment=True\n", + "delete_remote_agent=True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From a380a46bc932ea6d7b825e026666deecb81e9ff5 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 09:51:34 +0000 Subject: [PATCH 02/14] update links --- .../evaluation/evaluating_crewai_agent.ipynb | 737 +++++++---------- .../evaluating_langgraph_agent.ipynb | 783 +++++++----------- ...reasoning_engine_customized_template.ipynb | 765 +++++++---------- ...t_reasoning_engine_prebuilt_template.ipynb | 722 ++++++---------- ...reasoning_engine_customized_template.ipynb | 783 +++++++----------- 5 files changed, 1419 insertions(+), 2371 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 0b3cec2596..c588b87a11 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -33,22 +33,22 @@ "\n", "\n", " \n", " \n", " \n", " \n", @@ -58,23 +58,23 @@ "\n", "Share to:\n", "\n", - "\n", + "\n", " \"LinkedIn\n", "\n", "\n", - "\n", + "\n", " \"Bluesky\n", "\n", "\n", - "\n", + "\n", " \"X\n", "\n", "\n", - "\n", + "\n", " \"Reddit\n", "\n", "\n", - "\n", + "\n", " \"Facebook\n", "" ] @@ -114,8 +114,7 @@ "* Prepare Agent Evaluation dataset\n", "* Single tool usage evaluation\n", "* Trajectory evaluation\n", - "* Response evaluation\n", - "\n" + "* Response evaluation\n" ] }, { @@ -140,21 +139,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 13355, - "status": "ok", - "timestamp": 1734464541030, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tFy3H3aPgx12", - "outputId": "12b6f569-a1ba-41f2-ef43-4a7102ce0661" + "id": "tFy3H3aPgx12" }, "outputs": [], "source": [ @@ -182,21 +167,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { 
- "elapsed": 220, - "status": "ok", - "timestamp": 1734464586580, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "XRvKdaPDTznN", - "outputId": "de2a6a54-dbbc-45ac-d488-3f3b31972325" + "id": "XRvKdaPDTznN" }, "outputs": [], "source": [ @@ -261,38 +232,24 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 57 - }, - "executionInfo": { - "elapsed": 12971, - "status": "ok", - "timestamp": 1734464619145, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Nqwi-5ufWp_B", - "outputId": "9f529389-522f-4e1e-c41e-17578ef0ac74" + "id": "Nqwi-5ufWp_B" }, "outputs": [], "source": [ "# Use the environment variable if the user doesn't provide Project ID.\n", "import os\n", + "\n", "import vertexai\n", "\n", "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", "\n", "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT_ID\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", "\n", "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", "\n", - "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", "\n", "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" ] @@ -319,22 +276,27 @@ "# General\n", "import random\n", "import string\n", + "import warnings\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", "import pandas as pd\n", "import plotly.graph_objects as go\n", - "from IPython.display import display, Markdown, HTML\n", - "from typing import Literal\n", - "import warnings\n", - "warnings.filterwarnings('ignore', category=Warning, module='opentelemetry.trace')\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", "\n", "# Build agent\n", - "from crewai import Agent, Task, Crew, Process\n", - "from crewai_tools import tool\n", + "from crewai import Agent, Crew, Process, Task\n", "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", "\n", "# Evaluate agent\n", "from google.cloud import aiplatform\n", "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" ] }, { @@ -360,86 +322,106 @@ " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", "\n", + "\n", "def parse_crewai_output_to_dictionary(crew, crew_output):\n", " \"\"\"\n", " Parse CrewAI output into a structured dictionary format.\n", " \"\"\"\n", - " final_output = {\n", - " 'response': str(crew_output),\n", - " 'predicted_trajectory': []\n", - " }\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", "\n", " try:\n", " # Access tools_results directly from each agent\n", " for agent in crew.agents:\n", - " if hasattr(agent, 'tools_results'):\n", + " if hasattr(agent, \"tools_results\"):\n", 
" for tool_result in agent.tools_results:\n", " tool_info = {\n", - " 'tool_name': tool_result.get('tool_name', ''),\n", - " 'tool_input': tool_result.get('tool_args', {})\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", " }\n", - " final_output['predicted_trajectory'].append(tool_info)\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", "\n", " except Exception as e:\n", - " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", "\n", " return final_output\n", "\n", + "\n", "def format_output_as_markdown(output: dict) -> str:\n", " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", " markdown = \"### AI Response\\n\"\n", " markdown += f\"{output['response']}\\n\\n\"\n", "\n", - " if output['predicted_trajectory']:\n", + " if output[\"predicted_trajectory\"]:\n", " markdown += \"### Function Calls\\n\"\n", - " for call in output['predicted_trajectory']:\n", + " for call in output[\"predicted_trajectory\"]:\n", " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call['tool_input'].items():\n", + " for key, value in call[\"tool_input\"].items():\n", " markdown += f\" - `{key}`: `{value}`\\n\"\n", "\n", " return markdown\n", "\n", + "\n", "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", " display(Markdown(\"### Summary Metrics\"))\n", " display(metrics_df)\n", "\n", " display(Markdown(f\"### Row-wise Metrics\"))\n", " display(eval_result.metrics_table)\n", "\n", + "\n", "def display_drilldown(row: pd.Series) -> None:\n", " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", "\n", " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", "\n", - " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", - " display(HTML(f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", - "\n", - "\n", - " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", - " continue\n", - "\n", - "\n", - " for tool_input_key in predicted_trajectory['tool_input']:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", "\n", - " if tool_input_key in reference_trajectory['tool_input']:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", - " else:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", - " print(\"\\n\")\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", " display(HTML(\"
\"))\n", "\n", + "\n", "def display_dataframe_rows(\n", " df: pd.DataFrame,\n", " columns: list[str] | None = None,\n", " num_rows: int = 3,\n", - " display_drilldown: bool = False\n", + " display_drilldown: bool = False,\n", ") -> None:\n", " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", "\n", @@ -451,13 +433,22 @@ "\n", " for _, row in df.head(num_rows).iterrows():\n", " for column in df.columns:\n", - " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", " display(HTML(f\"{row[column]}
\"))\n", "\n", " display(HTML(\"
\"))\n", "\n", - " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", - " display_drilldown(row)\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", "\n", "def plot_bar_plot(\n", " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", @@ -487,6 +478,7 @@ " fig.update_layout(barmode=\"group\")\n", " fig.show()\n", "\n", + "\n", "def display_radar_plot(eval_results, title: str, metrics=None):\n", " \"\"\"Plot the radar plot.\"\"\"\n", " fig = go.Figure()\n", @@ -512,7 +504,7 @@ " fig.update_layout(\n", " title=title,\n", " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True\n", + " showlegend=True,\n", " )\n", " fig.show()" ] @@ -606,7 +598,7 @@ " tool_calls = last_message.get(\"tool_calls\", [])\n", "\n", " if tool_calls:\n", - " function_name = tool_calls[0].get('name')\n", + " function_name = tool_calls[0].get(\"name\")\n", " if function_name == \"get_product_price\":\n", " return \"get_product_price\"\n", " else:\n", @@ -659,19 +651,19 @@ "source": [ "def agent_parsed_outcome(input):\n", " product_researcher = Agent(\n", - " role='Product Researcher',\n", - " goal='Research product details and prices accurately',\n", - " backstory='Expert at gathering and analyzing product information',\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", " llm=model,\n", " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False\n", + " allow_delegation=False,\n", " )\n", "\n", " # Create task based on the input\n", " research_task = Task(\n", " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " f\"If the request is about price, use get_product_price tool. 
\"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", " expected_output=\"Product information including details and/or price based on the user request.\",\n", " agent=product_researcher,\n", " )\n", @@ -702,22 +694,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 6889, - "status": "ok", - "timestamp": 1734464661380, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "lGb58OJkjUs9", - "outputId": "f63e6f69-06b8-4818-e881-fda0fda5c474" + "id": "lGb58OJkjUs9" }, "outputs": [], "source": [ @@ -729,22 +706,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 207 - }, - "executionInfo": { - "elapsed": 4501, - "status": "ok", - "timestamp": 1734464665879, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "2wCFstt8w4Dx", - "outputId": "0d159ba7-7872-4b6e-d7cb-b505810ab1bb" + "id": "2wCFstt8w4Dx" }, "outputs": [], "source": [ @@ -805,26 +767,50 @@ "outputs": [], "source": [ "eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", - " ]\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", "}\n", "\n", "eval_sample_dataset = pd.DataFrame(eval_data)" @@ -843,22 +829,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "executionInfo": { - "elapsed": 242, - "status": "ok", - "timestamp": 
1734464674834, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "EjsonqWWvIvE", - "outputId": "8e466625-dd68-4d8d-fb30-8ee5969c2d5d" + "id": "EjsonqWWvIvE" }, "outputs": [], "source": [ @@ -873,8 +844,7 @@ "source": [ "### Single tool usage evaluation\n", "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", - "\n" + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" ] }, { @@ -898,9 +868,7 @@ }, "outputs": [], "source": [ - "single_tool_usage_metrics = [\n", - " TrajectorySingleToolUse(tool_name='get_product_price')\n", - "]" + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] }, { @@ -911,32 +879,14 @@ "source": [ "#### Run an evaluation task\n", "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", - "\n", - "\n", - "\n" + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 680 - }, - "executionInfo": { - "elapsed": 22544, - "status": "ok", - "timestamp": 1734464702337, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "SRv43fDcd5by", - "outputId": "cc26aba6-9bab-4a01-9e27-51da533b2e62" + "id": "SRv43fDcd5by" }, "outputs": [], "source": [ @@ -945,11 +895,12 @@ "single_tool_call_eval_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(single_tool_call_eval_result)" ] @@ -969,22 +920,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 794 - }, - "executionInfo": { - "elapsed": 230, - "status": "ok", - "timestamp": 1734464841587, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "1Jopzw83k14w", - "outputId": "4b2963d7-5fad-4c83-c78f-88b2df7c407b" + "id": "1Jopzw83k14w" }, "outputs": [], "source": [ @@ -1034,7 +970,11 @@ "outputs": [], "source": [ "trajectory_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + " \"trajectory_exact_match\",\n", + " 
\"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", "]" ] }, @@ -1053,35 +993,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 943 - }, - "executionInfo": { - "elapsed": 43240, - "status": "ok", - "timestamp": 1734465074991, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "vOdS7TJUneHN", - "outputId": "c652c7d5-6723-45d8-8744-9a22cc6f89bf" + "id": "vOdS7TJUneHN" }, "outputs": [], "source": [ "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", "\n", "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=trajectory_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(trajectory_eval_result)" ] @@ -1101,22 +1025,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 500, - "status": "ok", - "timestamp": 1734465076492, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "z7-LdM3mLBtk", - "outputId": "6bee7ab1-cf45-4f46-f35e-c1293cf53dfd" + "id": "z7-LdM3mLBtk" }, "outputs": [], "source": [ @@ -1127,26 +1036,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 215, - "status": "ok", - "timestamp": 1734465107714, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sLVRdN5llA0h", - "outputId": "434d4447-c5d1-4e8c-fd8f-57a8c7e74d61" + "id": "sLVRdN5llA0h" }, "outputs": [], "source": [ - "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" ] }, { @@ -1170,8 +1068,7 @@ "\n", "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", - "\n" + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] }, { @@ -1182,9 +1079,7 @@ }, "outputs": [], "source": [ - "response_metrics = [\n", - " 'safety', 'coherence'\n", - "]" + "response_metrics = [\"safety\", \"coherence\"]" ] }, { @@ -1202,35 +1097,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 558 - }, - "executionInfo": { - "elapsed": 36837, - "status": "ok", - "timestamp": 1734465316039, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wRb2EC_hknSD", - "outputId": "60585365-931e-4cef-98af-b88d2e2ec8c0" + "id": "wRb2EC_hknSD" }, "outputs": [], "source": [ "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", "\n", "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(response_eval_result)" ] @@ -1251,22 +1130,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 229, - "status": "ok", - "timestamp": 1734465218165, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZODTRuq2lF75", - "outputId": "cabe4dcf-d02f-4e5d-dcf8-103436751c38" + "id": "ZODTRuq2lF75" }, "outputs": [], "source": [ @@ -1296,8 +1160,7 @@ "\n", "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", - "\n" + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] }, { @@ -1317,7 +1180,7 @@ " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", " )\n", - " }\n", + "}\n", "\n", "pointwise_rating_rubric = {\n", " \"1\": \"Follows trajectory\",\n", @@ -1357,10 +1220,7 @@ "id": "e1djVp7Fi4Yy" }, "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. 
The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", - "\n", - "\n", - "\n" + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] }, { @@ -1385,8 +1245,7 @@ "source": [ "#### Set response metrics\n", "\n", - "Set new generated response evaluation metrics by including the custom metric.\n", - "\n" + "Set new generated response evaluation metrics by including the custom metric.\n" ] }, { @@ -1398,7 +1257,10 @@ "outputs": [], "source": [ "response_tool_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", "]" ] }, @@ -1417,22 +1279,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 804 - }, - "executionInfo": { - "elapsed": 38880, - "status": "ok", - "timestamp": 1734465423880, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "_dkb4gSn7Ywv", - "outputId": "2a7ff186-9648-4bb9-fa91-88ce10b2a54a" + "id": "_dkb4gSn7Ywv" }, "outputs": [], "source": [ @@ -1441,11 +1288,12 @@ "response_eval_tool_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(response_eval_tool_result)" ] @@ -1465,22 +1313,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 247, - "status": "ok", - "timestamp": 1734465424124, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "GH2YvXgLlLH7", - "outputId": "4d7088d0-389d-4924-ff57-5de8db3dfb6b" + "id": "GH2YvXgLlLH7" }, "outputs": [], "source": [ @@ -1491,26 +1324,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 3, - "status": "ok", - "timestamp": 1734465424974, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tdVhCURXMdLG", - "outputId": "9de4b57c-0fa8-44e0-a649-3ec37c57b6d9" + "id": "tdVhCURXMdLG" }, "outputs": [], "source": [ - "plot_bar_plot(response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "plot_bar_plot(\n", + " response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ 
-1521,8 +1343,7 @@ "source": [ "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", - "\n" + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] }, { @@ -1545,41 +1366,94 @@ "outputs": [], "source": [ "byod_eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'generated_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 
'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", - "\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", "}\n", "\n", "byod_eval_sample_dataset = pd.DataFrame(eval_data)" @@ -1600,22 +1474,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 804 - }, - "executionInfo": { - "elapsed": 41869, - "status": "ok", - "timestamp": 1734466280031, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wBD-4wpB7q-3", - "outputId": "6928eeb6-902b-44b8-ca89-19ecce03df20" + "id": "wBD-4wpB7q-3" }, "outputs": [], "source": [ @@ -1624,11 +1483,12 @@ "byod_response_eval_tool_task = EvalTask(\n", " dataset=byod_eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] @@ -1648,22 +1508,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 1576, - "status": "ok", - "timestamp": 1734466281336, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "pQFzmd2I7q-3", - "outputId": "410f6ee9-57f4-4304-c573-1a86b21e51af" + "id": "pQFzmd2I7q-3" }, "outputs": [], "source": [ @@ -1674,26 +1519,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 232, - "status": "ok", - "timestamp": 1734466361805, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "84HiPDOkPseW", - "outputId": "72d05905-3081-4bb6-a911-49796c9815a6" + "id": "84HiPDOkPseW" }, "outputs": [], "source": [ - "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " 
title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ -1702,8 +1536,7 @@ "id": "2a4e033321ad" }, "source": [ - "## Cleaning up\n", - "\n" + "## Cleaning up\n" ] }, { @@ -1714,20 +1547,20 @@ }, "outputs": [], "source": [ - "delete_experiment=True\n", + "delete_experiment = True\n", "\n", "if delete_experiment:\n", " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", " except Exception as e:\n", - " print(e)" + " print(e)" ] } ], "metadata": { "colab": { - "provenance": [], + "name": "evaluating_crewai_agent.ipynb", "toc_visible": true }, "environment": { diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index 095eebd601..c17a6ba09e 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -33,22 +33,22 @@ "\n", "
\n", - " \n", + " \n", " \"Google
Open in Colab\n", "
\n", "
\n", - " \n", + " \n", " \"Google
Open in Colab Enterprise\n", "
\n", "
\n", - " \n", + " \n", " \"Vertex
Open in Vertex AI Workbench\n", "
\n", "
\n", - " \n", + " \n", " \"GitHub
View on GitHub\n", "
\n", "
\n", " \n", " \n", " \n", " \n", @@ -58,23 +58,23 @@ "\n", "Share to:\n", "\n", - "\n", + "\n", " \"LinkedIn\n", "\n", "\n", - "\n", + "\n", " \"Bluesky\n", "\n", "\n", - "\n", + "\n", " \"X\n", "\n", "\n", - "\n", + "\n", " \"Reddit\n", "\n", "\n", - "\n", + "\n", " \"Facebook\n", "" ] @@ -114,8 +114,7 @@ "* Prepare Agent Evaluation dataset\n", "* Single tool usage evaluation\n", "* Trajectory evaluation\n", - "* Response evaluation\n", - "\n" + "* Response evaluation\n" ] }, { @@ -140,21 +139,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 22416, - "status": "ok", - "timestamp": 1734466630955, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tFy3H3aPgx12", - "outputId": "14563449-48f5-4fe2-ef6b-07fb5a38df3d" + "id": "tFy3H3aPgx12" }, "outputs": [], "source": [ @@ -183,21 +168,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 223, - "status": "ok", - "timestamp": 1734466639485, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "XRvKdaPDTznN", - "outputId": "73a1ca2d-66c9-4f50-f6c0-a9eb0971ea2a" + "id": "XRvKdaPDTznN" }, "outputs": [], "source": [ @@ -262,22 +233,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 57 - }, - "executionInfo": { - "elapsed": 13286, - "status": "ok", - "timestamp": 1734466891717, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Nqwi-5ufWp_B", - "outputId": "d12e3817-c999-459e-c98f-363fd917c24c" + "id": "Nqwi-5ufWp_B" }, "outputs": [], "source": [ @@ -293,7 +249,7 @@ "\n", "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", "\n", - "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", "\n", "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" ] @@ -320,23 +276,28 @@ "# General\n", "import random\n", "import string\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from IPython.display import display, Markdown, HTML\n", "from typing import Literal\n", "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", "# Build agent\n", "from langchain_core.messages import BaseMessage, HumanMessage\n", "from langchain_core.tools import tool\n", "from langchain_google_vertexai import ChatVertexAI\n", "from langgraph.graph import END, MessageGraph\n", "from langgraph.prebuilt import ToolNode\n", - "from langchain.load import dump as langchain_load_dump\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" ] }, { @@ -362,87 +323,118 @@ " 
\"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", "\n", + "\n", "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", "\n", - " final_output = {'response': \"No AI response found in the message history.\",\n", - " 'predicted_trajectory': []}\n", + " final_output = {\n", + " \"response\": \"No AI response found in the message history.\",\n", + " \"predicted_trajectory\": [],\n", + " }\n", "\n", " # Process each message\n", " function_calls = []\n", " for message in messages:\n", " # Check if it's a Tool message which contains the actual response\n", - " if message.get('type') == 'constructor' and 'ToolMessage' in message.get('id', []):\n", - " final_output['response'] = message['kwargs']['content']\n", + " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", "\n", " # Check if it's an AI message to get tool calls\n", - " elif message.get('type') == 'constructor' and 'AIMessage' in message.get('id', []):\n", - " tool_calls = message['kwargs'].get('tool_calls', [])\n", + " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", " for tool_call in tool_calls:\n", " if tool_call:\n", - " function_calls.append({\n", - " \"tool_name\": tool_call.get('name'),\n", - " \"tool_input\": tool_call.get('args')\n", - " })\n", - "\n", - " final_output['predicted_trajectory'] = function_calls\n", + " function_calls.append(\n", + " {\n", + " \"tool_name\": tool_call.get(\"name\"),\n", + " \"tool_input\": tool_call.get(\"args\"),\n", + " }\n", + " )\n", + "\n", + " final_output[\"predicted_trajectory\"] = function_calls\n", " return final_output\n", "\n", + "\n", "def format_output_as_markdown(output: dict) -> str:\n", " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", " markdown = \"### AI Response\\n\"\n", " markdown += f\"{output['response']}\\n\\n\"\n", "\n", - " if output['predicted_trajectory']:\n", + " if output[\"predicted_trajectory\"]:\n", " markdown += \"### Function Calls\\n\"\n", - " for call in output['predicted_trajectory']:\n", + " for call in output[\"predicted_trajectory\"]:\n", " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call['tool_input'].items():\n", + " for key, value in call[\"tool_input\"].items():\n", " markdown += f\" - `{key}`: `{value}`\\n\"\n", "\n", " return markdown\n", "\n", + "\n", "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", " display(Markdown(\"### Summary Metrics\"))\n", " display(metrics_df)\n", "\n", " display(Markdown(f\"### Row-wise Metrics\"))\n", " display(eval_result.metrics_table)\n", "\n", + "\n", "def display_drilldown(row: pd.Series) -> None:\n", " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", "\n", " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", "\n", - " if not 
(isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", - " display(HTML(f\"

Tool Names: {predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", - "\n", - "\n", - " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", - " continue\n", - "\n", - "\n", - " for tool_input_key in predicted_trajectory['tool_input']:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names: {predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", "\n", - " if tool_input_key in reference_trajectory['tool_input']:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", - " else:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", - " print(\"\\n\")\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", " display(HTML(\"
\"))\n", "\n", + "\n", "def display_dataframe_rows(\n", " df: pd.DataFrame,\n", " columns: list[str] | None = None,\n", " num_rows: int = 3,\n", - " display_drilldown: bool = False\n", + " display_drilldown: bool = False,\n", ") -> None:\n", " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", "\n", @@ -454,13 +446,22 @@ "\n", " for _, row in df.head(num_rows).iterrows():\n", " for column in df.columns:\n", - " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", " display(HTML(f\"{row[column]}
\"))\n", "\n", " display(HTML(\"
\"))\n", "\n", - " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", - " display_drilldown(row)\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", "\n", "def plot_bar_plot(\n", " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", @@ -490,6 +491,7 @@ " fig.update_layout(barmode=\"group\")\n", " fig.show()\n", "\n", + "\n", "def display_radar_plot(eval_results, title: str, metrics=None):\n", " \"\"\"Plot the radar plot.\"\"\"\n", " fig = go.Figure()\n", @@ -515,7 +517,7 @@ " fig.update_layout(\n", " title=title,\n", " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True\n", + " showlegend=True,\n", " )\n", " fig.show()" ] @@ -595,7 +597,9 @@ }, "outputs": [], "source": [ - "def router(state: list[BaseMessage]) -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", " # Get the tool_calls from the last message in the conversation history.\n", " tool_calls = state[-1].tool_calls\n", @@ -603,7 +607,7 @@ " # If there are any tool_calls\n", " if tool_calls:\n", " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get('name')\n", + " function_name = tool_calls[0].get(\"name\")\n", " if function_name == \"get_product_price\":\n", " return \"get_product_price\"\n", " else:\n", @@ -658,24 +662,24 @@ "source": [ "def agent_parsed_outcome(input):\n", "\n", - " model = ChatVertexAI(model=model)\n", - " builder = MessageGraph()\n", + " model = ChatVertexAI(model=model)\n", + " builder = MessageGraph()\n", "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", "\n", - " app = builder.compile()\n", - " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", - " return parse_messages_to_output_dictionary(chat_history)" + " app = builder.compile()\n", + " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", + " return parse_messages_to_output_dictionary(chat_history)" ] }, { @@ -693,22 +697,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 1956, - "status": "ok", - "timestamp": 1734466947459, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "lGb58OJkjUs9", - "outputId": "3142f55a-324f-4a6b-b7fa-d7aee65efff4" + "id": "lGb58OJkjUs9" }, "outputs": [], "source": [ @@ -720,22 +709,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 1543, - "status": "ok", - "timestamp": 1734466950133, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "2wCFstt8w4Dx", - "outputId": "2ddf9be5-4a48-4c17-dd89-b2b4fc1698de" + "id": "2wCFstt8w4Dx" }, "outputs": [], "source": [ @@ -796,26 +770,50 @@ "outputs": [], "source": [ "eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", - " ]\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", "}\n", "\n", "eval_sample_dataset = pd.DataFrame(eval_data)" @@ -834,22 +832,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "executionInfo": { - "elapsed": 249, - "status": "ok", - "timestamp": 1734467121881, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "EjsonqWWvIvE", - "outputId": "a3bcc38c-0888-4f0b-8863-b1fbb7b43e53" + "id": "EjsonqWWvIvE" }, "outputs": [], "source": [ @@ -864,8 +847,7 @@ "source": [ "### Single tool usage evaluation\n", "\n", - "After you've 
set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", - "\n" + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" ] }, { @@ -889,9 +871,7 @@ }, "outputs": [], "source": [ - "single_tool_usage_metrics = [\n", - " TrajectorySingleToolUse(tool_name='get_product_price')\n", - "]" + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] }, { @@ -902,32 +882,14 @@ "source": [ "#### Run an evaluation task\n", "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", - "\n", - "\n", - "\n" + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 647 - }, - "executionInfo": { - "elapsed": 14596, - "status": "ok", - "timestamp": 1734467151419, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "SRv43fDcd5by", - "outputId": "3a67f9ec-ad45-46f1-febc-909ccf6f9512" + "id": "SRv43fDcd5by" }, "outputs": [], "source": [ @@ -936,11 +898,12 @@ "single_tool_call_eval_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(single_tool_call_eval_result)" ] @@ -960,22 +923,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 794 - }, - "executionInfo": { - "elapsed": 354, - "status": "ok", - "timestamp": 1734467151750, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "1Jopzw83k14w", - "outputId": "d0431283-5663-479d-c71f-8a9e2be005a7" + "id": "1Jopzw83k14w" }, "outputs": [], "source": [ @@ -1025,7 +973,11 @@ "outputs": [], "source": [ "trajectory_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", "]" ] }, @@ -1044,35 +996,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 33448, - "status": 
"ok", - "timestamp": 1734467192781, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "vOdS7TJUneHN", - "outputId": "b089778d-5d04-4b66-aa55-606e3b234ddf" + "id": "vOdS7TJUneHN" }, "outputs": [], "source": [ "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", "\n", "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=trajectory_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(trajectory_eval_result)" ] @@ -1092,22 +1028,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 245, - "status": "ok", - "timestamp": 1734467193023, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sLVRdN5llA0h", - "outputId": "61897eaf-ebb3-4f8e-8c4c-c589aeef795c" + "id": "sLVRdN5llA0h" }, "outputs": [], "source": [ @@ -1118,26 +1039,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 483, - "status": "ok", - "timestamp": 1734467194033, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "erYYZEaaTNjJ", - "outputId": "69198916-d617-4116-d02d-42d1e44ad0c1" + "id": "erYYZEaaTNjJ" }, "outputs": [], "source": [ - "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" ] }, { @@ -1161,8 +1071,7 @@ "\n", "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", - "\n" + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] }, { @@ -1173,9 +1082,7 @@ }, "outputs": [], "source": [ - "response_metrics = [\n", - " 'safety', 'coherence'\n", - "]" + "response_metrics = [\"safety\", \"coherence\"]" ] }, { @@ -1193,35 +1100,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 734 - }, - "executionInfo": { - "elapsed": 22438, - "status": "ok", - "timestamp": 1734467221724, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wRb2EC_hknSD", - "outputId": "ae416e8a-12ed-4522-f9c1-b7c966242a9e" + "id": "wRb2EC_hknSD" }, "outputs": [], "source": [ "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", "\n", "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(response_eval_result)" ] @@ -1242,22 +1133,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 571, - "status": "ok", - "timestamp": 1734467222292, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZODTRuq2lF75", - "outputId": "0b1f1517-9211-413f-ba7b-ae4742c5337c" + "id": "ZODTRuq2lF75" }, "outputs": [], "source": [ @@ -1287,8 +1163,7 @@ "\n", "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", - "\n" + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). 
Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] }, { @@ -1308,7 +1183,7 @@ " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", " )\n", - " }\n", + "}\n", "\n", "pointwise_rating_rubric = {\n", " \"1\": \"Follows trajectory\",\n", @@ -1335,21 +1210,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 238, - "status": "ok", - "timestamp": 1734467234001, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "5EL7iEDMikNQ", - "outputId": "d9290dd7-bd93-4dd6-b8c9-e14c8a22d6e0" + "id": "5EL7iEDMikNQ" }, "outputs": [], "source": [ @@ -1362,10 +1223,7 @@ "id": "e1djVp7Fi4Yy" }, "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", - "\n", - "\n", - "\n" + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] }, { @@ -1390,8 +1248,7 @@ "source": [ "#### Set response metrics\n", "\n", - "Set new generated response evaluation metrics by including the custom metric.\n", - "\n" + "Set new generated response evaluation metrics by including the custom metric.\n" ] }, { @@ -1403,7 +1260,10 @@ "outputs": [], "source": [ "response_tool_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", "]" ] }, @@ -1422,22 +1282,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 31118, - "status": "ok", - "timestamp": 1734467268809, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "_dkb4gSn7Ywv", - "outputId": "546e8a71-4161-40be-a61d-85f4031f07d7" + "id": "_dkb4gSn7Ywv" }, "outputs": [], "source": [ @@ -1446,11 +1291,12 @@ "response_eval_tool_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(response_eval_tool_result)" ] @@ -1470,22 +1316,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 255, - "status": "ok", - 
"timestamp": 1734467269033, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "GH2YvXgLlLH7", - "outputId": "c1cca2ca-c91f-43af-f816-eb6c7231084f" + "id": "GH2YvXgLlLH7" }, "outputs": [], "source": [ @@ -1500,9 +1331,7 @@ "source": [ "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", - "\n", - "\n" + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] }, { @@ -1525,41 +1354,94 @@ "outputs": [], "source": [ "byod_eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'generated_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " 
},\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", - "\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", "}\n", "\n", "byod_eval_sample_dataset = pd.DataFrame(eval_data)" @@ -1580,22 +1462,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 31603, - "status": "ok", - "timestamp": 1734467316380, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wBD-4wpB7q-3", - "outputId": "48539d5f-cfcc-490c-9361-001d9c5655fb" + "id": "wBD-4wpB7q-3" }, "outputs": [], "source": [ @@ -1604,11 +1471,12 @@ "byod_response_eval_tool_task = EvalTask(\n", " dataset=byod_eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_outcome,\n", - " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] @@ -1628,22 +1496,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 432, - "status": "ok", - "timestamp": 1734467316808, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "pQFzmd2I7q-3", - "outputId": "6291a671-5d4b-47fc-ccbb-7beecb681498" + "id": "pQFzmd2I7q-3" }, "outputs": [], "source": [ @@ -1654,26 +1507,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 3, - "status": "ok", - "timestamp": 1734467316808, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "DJr8GqQKTpUa", - "outputId": "763a002d-2170-4107-8588-9cfa0d462d63" + "id": 
"DJr8GqQKTpUa" }, "outputs": [], "source": [ - "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ -1682,46 +1524,31 @@ "id": "2a4e033321ad" }, "source": [ - "## Cleaning up\n", - "\n" + "## Cleaning up\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 16979, - "status": "ok", - "timestamp": 1734430207912, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Ox2I3UfRlTOd", - "outputId": "c608c294-0311-42f9-aae4-ca40befd159c" + "id": "Ox2I3UfRlTOd" }, "outputs": [], "source": [ - "delete_experiment=True\n", + "delete_experiment = True\n", "\n", "if delete_experiment:\n", " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", " except Exception as e:\n", - " print(e)" + " print(e)" ] } ], "metadata": { "colab": { - "provenance": [], + "name": "evaluating_langgraph_agent.ipynb", "toc_visible": true }, "environment": { diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 146b976c63..645877010d 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -113,8 +113,7 @@ "* Prepare Agent Evaluation dataset\n", "* Single tool usage evaluation\n", "* Trajectory evaluation\n", - "* Response evaluation\n", - "\n" + "* Response evaluation\n" ] }, { @@ -139,21 +138,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 86407, - "status": "ok", - "timestamp": 1734509697919, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tFy3H3aPgx12", - "outputId": "23790fd1-31d0-4617-ee06-0338947224e0" + "id": "tFy3H3aPgx12" }, "outputs": [], "source": [ @@ -181,21 +166,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 247, - "status": "ok", - "timestamp": 1734509760437, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "XRvKdaPDTznN", - "outputId": "11a3a8d8-dbbe-4ebb-b17b-bfbbabed07f9" + "id": "XRvKdaPDTznN" }, "outputs": [], "source": [ @@ -260,22 +231,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - }, - "executionInfo": { - "elapsed": 9679, - "status": "ok", - "timestamp": 1734509790791, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Nqwi-5ufWp_B", - "outputId": "e959f3e6-eca7-4d49-f757-e006b03e3ef1" + "id": "Nqwi-5ufWp_B" }, "outputs": [], "source": [ @@ -300,9 +256,14 @@ "\n", 
"! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" ] }, { @@ -327,22 +288,25 @@ "# General\n", "import random\n", "import string\n", - "import json\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from IPython.display import display, Markdown, HTML\n", - "from typing import Callable, Sequence, TypedDict, Annotated, Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", "\n", "# Build agent\n", - "from crewai import Agent, Task, Crew, Process\n", - "from crewai_tools import tool\n", + "from crewai import Agent, Crew, Process, Task\n", "from crewai.flow.flow import Flow, listen, start\n", - "from vertexai.preview import reasoning_engines\n", + "from crewai_tools import tool\n", "\n", "# Evaluate agent\n", "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" ] }, { @@ -368,86 +332,106 @@ " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", "\n", + "\n", "def parse_crewai_output_to_dictionary(crew, crew_output):\n", " \"\"\"\n", " Parse CrewAI output into a structured dictionary format.\n", " \"\"\"\n", - " final_output = {\n", - " 'response': str(crew_output),\n", - " 'predicted_trajectory': []\n", - " }\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", "\n", " try:\n", " # Access tools_results directly from each agent\n", " for agent in crew.agents:\n", - " if hasattr(agent, 'tools_results'):\n", + " if hasattr(agent, \"tools_results\"):\n", " for tool_result in agent.tools_results:\n", " tool_info = {\n", - " 'tool_name': tool_result.get('tool_name', ''),\n", - " 'tool_input': tool_result.get('tool_args', {})\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", " }\n", - " final_output['predicted_trajectory'].append(tool_info)\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", "\n", " except Exception as e:\n", - " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", "\n", " return final_output\n", "\n", + "\n", "def format_output_as_markdown(output: dict) -> str:\n", " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", " markdown = \"### AI Response\\n\"\n", " markdown += f\"{output['response']}\\n\\n\"\n", "\n", - " if output['predicted_trajectory']:\n", + " if output[\"predicted_trajectory\"]:\n", " markdown += \"### Function Calls\\n\"\n", - " for call in output['predicted_trajectory']:\n", + " for call in 
output[\"predicted_trajectory\"]:\n", " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call['tool_input'].items():\n", + " for key, value in call[\"tool_input\"].items():\n", " markdown += f\" - `{key}`: `{value}`\\n\"\n", "\n", " return markdown\n", "\n", + "\n", "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", " display(Markdown(\"### Summary Metrics\"))\n", " display(metrics_df)\n", "\n", " display(Markdown(f\"### Row-wise Metrics\"))\n", " display(eval_result.metrics_table)\n", "\n", + "\n", "def display_drilldown(row: pd.Series) -> None:\n", " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", "\n", " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", "\n", - " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", - " display(HTML(f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", - "\n", - "\n", - " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", - " continue\n", - "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", "\n", - " for tool_input_key in predicted_trajectory['tool_input']:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", "\n", - " if tool_input_key in reference_trajectory['tool_input']:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", - " else:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", - " print(\"\\n\")\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", " display(HTML(\"
\"))\n", "\n", + "\n", "def display_dataframe_rows(\n", " df: pd.DataFrame,\n", " columns: list[str] | None = None,\n", " num_rows: int = 3,\n", - " display_drilldown: bool = False\n", + " display_drilldown: bool = False,\n", ") -> None:\n", " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", "\n", @@ -459,13 +443,22 @@ "\n", " for _, row in df.head(num_rows).iterrows():\n", " for column in df.columns:\n", - " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", " display(HTML(f\"{row[column]}
\"))\n", "\n", " display(HTML(\"
\"))\n", "\n", - " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", - " display_drilldown(row)\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", "\n", "def plot_bar_plot(\n", " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", @@ -495,6 +488,7 @@ " fig.update_layout(barmode=\"group\")\n", " fig.show()\n", "\n", + "\n", "def display_radar_plot(eval_results, title: str, metrics=None):\n", " \"\"\"Plot the radar plot.\"\"\"\n", " fig = go.Figure()\n", @@ -520,7 +514,7 @@ " fig.update_layout(\n", " title=title,\n", " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True\n", + " showlegend=True,\n", " )\n", " fig.show()" ] @@ -533,8 +527,7 @@ "source": [ "## Build an agent using Vertex AI Reasoning Engine's customized template\n", "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n", - "\n" + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" ] }, { @@ -615,7 +608,7 @@ " tool_calls = last_message.get(\"tool_calls\", [])\n", "\n", " if tool_calls:\n", - " function_name = tool_calls[0].get('name')\n", + " function_name = tool_calls[0].get(\"name\")\n", " if function_name == \"get_product_price\":\n", " return \"get_product_price\"\n", " else:\n", @@ -677,25 +670,25 @@ " # The set_up method is used to define application initialization logic\n", " def set_up(self) -> None:\n", " \"\"\"Set up the application.\"\"\"\n", - " os.environ['GOOGLE_CLOUD_PROJECT'] = self.project_id\n", + " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", " return\n", "\n", " # The query method will be used to send inputs to the agent\n", " def query(self, input: str):\n", " \"\"\"Query the application.\"\"\"\n", " product_researcher = Agent(\n", - " role='Product Researcher',\n", - " goal='Research product details and prices accurately',\n", - " backstory='Expert at gathering and analyzing product information',\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", " )\n", "\n", " research_task = Task(\n", " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " f\"If the request is about price, use get_product_price tool. 
\"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", " expected_output=\"Product information including details and/or price based on the user request.\",\n", " agent=product_researcher,\n", " )\n", @@ -737,22 +730,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 7192, - "status": "ok", - "timestamp": 1734510146357, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "PgkOhPmN3aCZ", - "outputId": "6e2752f9-b237-4f0b-e04a-33fb0e7be373" + "id": "PgkOhPmN3aCZ" }, "outputs": [], "source": [ @@ -764,22 +742,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 207 - }, - "executionInfo": { - "elapsed": 3279, - "status": "ok", - "timestamp": 1734510149635, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "lGb58OJkjUs9", - "outputId": "ba74d03c-364c-42e9-847e-8d819f19836b" + "id": "lGb58OJkjUs9" }, "outputs": [], "source": [ @@ -806,21 +769,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 413622, - "status": "ok", - "timestamp": 1734510599752, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "3HLz_a1We4QE", - "outputId": "70a0a6f2-5891-4e57-ce34-2da339e4978c" + "id": "3HLz_a1We4QE" }, "outputs": [], "source": [ @@ -854,22 +803,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 5343, - "status": "ok", - "timestamp": 1734510633271, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sqBtzYyce4QE", - "outputId": "ebef6581-b843-47e4-cfe5-f95e6ba80f4f" + "id": "sqBtzYyce4QE" }, "outputs": [], "source": [ @@ -931,26 +865,50 @@ "outputs": [], "source": [ "eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": 
{\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", - " ]\n", + " ],\n", "}\n", "\n", "eval_sample_dataset = pd.DataFrame(eval_data)" @@ -969,22 +927,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "executionInfo": { - "elapsed": 331, - "status": "ok", - "timestamp": 1734510641426, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "EjsonqWWvIvE", - "outputId": "5cd251f4-8990-4b92-f8f9-1b3d2b97e626" + "id": "EjsonqWWvIvE" }, "outputs": [], "source": [ @@ -1011,11 +954,11 @@ "outputs": [], "source": [ "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", "\n", - " result = remote_custom_agent.query(input=input)\n", + " result = remote_custom_agent.query(input=input)\n", "\n", - " return result" + " return result" ] }, { @@ -1026,8 +969,7 @@ "source": [ "### Single tool usage evaluation\n", "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", - "\n" + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" ] }, { @@ -1051,9 +993,7 @@ }, "outputs": [], "source": [ - "single_tool_usage_metrics = [\n", - " TrajectorySingleToolUse(tool_name='get_product_price')\n", - "]" + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] }, { @@ -1064,10 +1004,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n", - "\n", - "\n", - "\n" + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" ] }, { @@ -1085,33 +1022,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 700 - }, - "executionInfo": { - "elapsed": 35014, - "status": "ok", - "timestamp": 1734510766421, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "SRv43fDcd5by", - "outputId": "47bd1ae5-d484-4c36-ccf6-0e79ab83c108" + "id": "SRv43fDcd5by" }, "outputs": [], "source": [ "single_tool_call_eval_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_response,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(single_tool_call_eval_result)" ] @@ -1131,22 +1054,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 794 - }, - "executionInfo": { - "elapsed": 233, - "status": "ok", - "timestamp": 1734510771445, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZkpwPReipekr", - "outputId": "bf3e0f46-e2b5-4270-9d2f-c576e1f52519" + "id": "ZkpwPReipekr" }, "outputs": [], "source": [ @@ -1196,7 +1104,11 @@ "outputs": [], "source": [ "trajectory_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", "]" ] }, @@ -1215,31 +1127,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 41311, - "status": "ok", - "timestamp": 1734510817099, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "vOdS7TJUneHN", - "outputId": "51ece05f-b9ff-46e0-b99f-f9b2f4be5c31" + "id": "vOdS7TJUneHN" }, "outputs": [], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", "\n", "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=trajectory_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", @@ -1262,22 +1157,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 243, - "status": "ok", - "timestamp": 1734510817339, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sLVRdN5llA0h", - "outputId": 
"588eee63-64eb-4f47-9bc8-41978fcef599" + "id": "sLVRdN5llA0h" }, "outputs": [], "source": [ @@ -1288,26 +1168,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 459, - "status": "ok", - "timestamp": 1734510817797, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "PrxM5sMZYXHP", - "outputId": "35a593ee-c99f-4b74-81b8-05a53d80263c" + "id": "PrxM5sMZYXHP" }, "outputs": [], "source": [ - "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" ] }, { @@ -1331,8 +1200,7 @@ "\n", "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", - "\n" + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] }, { @@ -1343,9 +1211,7 @@ }, "outputs": [], "source": [ - "response_metrics = [\n", - " 'safety', 'coherence'\n", - "]" + "response_metrics = [\"safety\", \"coherence\"]" ] }, { @@ -1363,31 +1229,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 943 - }, - "executionInfo": { - "elapsed": 31734, - "status": "ok", - "timestamp": 1734510849530, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wRb2EC_hknSD", - "outputId": "d5c6f65c-405a-463f-bb38-01112871c66d" + "id": "wRb2EC_hknSD" }, "outputs": [], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", "\n", "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", @@ -1411,22 +1260,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 276, - "status": "ok", - "timestamp": 1734510849797, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "cy0aRydrp9zW", - "outputId": "ed3add5b-03ef-4591-8d2c-eead865c017b" + "id": "cy0aRydrp9zW" }, "outputs": [], "source": [ @@ -1456,8 +1290,7 @@ "\n", "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", "\n", - "Define a `criteria` to set the evaluation guidelines and a 
`pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", - "\n" + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] }, { @@ -1477,7 +1310,7 @@ " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", " )\n", - " }\n", + "}\n", "\n", "pointwise_rating_rubric = {\n", " \"1\": \"Follows trajectory\",\n", @@ -1504,21 +1337,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 3, - "status": "ok", - "timestamp": 1734510849797, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "5EL7iEDMikNQ", - "outputId": "500f3026-cb93-44b6-d5fb-d96ab863444a" + "id": "5EL7iEDMikNQ" }, "outputs": [], "source": [ @@ -1531,10 +1350,7 @@ "id": "e1djVp7Fi4Yy" }, "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", - "\n", - "\n", - "\n" + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] }, { @@ -1559,8 +1375,7 @@ "source": [ "#### Set response metrics\n", "\n", - "Set new generated response evaluation metrics by including the custom metric.\n", - "\n" + "Set new generated response evaluation metrics by including the custom metric.\n" ] }, { @@ -1572,7 +1387,10 @@ "outputs": [], "source": [ "response_tool_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", "]" ] }, @@ -1591,34 +1409,21 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 41502, - "status": "ok", - "timestamp": 1734510891298, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "_dkb4gSn7Ywv", - "outputId": "02d10b75-d728-4520-ed10-bde10a1da2dc" + "id": "_dkb4gSn7Ywv" }, "outputs": [], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", "\n", "response_eval_tool_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=agent_parsed_response)\n", + 
"response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", "\n", "display_eval_report(response_eval_tool_result)" ] @@ -1639,22 +1444,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 377, - "status": "ok", - "timestamp": 1734510891610, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZODTRuq2lF75", - "outputId": "68651fed-5017-4ed1-a85e-3e2ec3b49c05" + "id": "ZODTRuq2lF75" }, "outputs": [], "source": [ @@ -1669,8 +1459,7 @@ "source": [ "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", - "\n" + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] }, { @@ -1693,41 +1482,94 @@ "outputs": [], "source": [ "byod_eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'generated_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " 
\"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", - "\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", "}\n", "\n", "byod_eval_sample_dataset = pd.DataFrame(eval_data)" @@ -1748,22 +1590,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 925 - }, - "executionInfo": { - "elapsed": 40090, - "status": "ok", - "timestamp": 1734510931695, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wBD-4wpB7q-3", - "outputId": "082a5d56-c5ad-4a3b-8f2b-8f3902d2892d" + "id": "wBD-4wpB7q-3" }, "outputs": [], "source": [ @@ -1772,11 +1599,12 @@ "byod_response_eval_tool_task = EvalTask(\n", " dataset=byod_eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_response,\n", - " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] @@ -1796,22 +1624,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 517, - "status": "ok", - "timestamp": 1734510932184, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "pQFzmd2I7q-3", - "outputId": "49c8127b-591c-4880-8380-3595dff0b52c" + "id": "pQFzmd2I7q-3" }, "outputs": [], "source": [ @@ -1822,26 +1635,15 @@ "cell_type": "code", 
"execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 10, - "status": "ok", - "timestamp": 1734510932184, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "0FEbvEOkZS8f", - "outputId": "d0a8195f-dca9-4ff9-e48c-e81b172bec1a" + "id": "0FEbvEOkZS8f" }, "outputs": [], "source": [ - "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ -1850,8 +1652,7 @@ "id": "2a4e033321ad" }, "source": [ - "## Cleaning up\n", - "\n" + "## Cleaning up\n" ] }, { @@ -1862,27 +1663,27 @@ }, "outputs": [], "source": [ - "delete_experiment=True\n", - "delete_remote_agent=True\n", + "delete_experiment = True\n", + "delete_remote_agent = True\n", "\n", "if delete_experiment:\n", " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", " except Exception as e:\n", - " print(e)\n", + " print(e)\n", "\n", "if delete_remote_agent:\n", " try:\n", - " remote_custom_agent.delete()\n", + " remote_custom_agent.delete()\n", " except Exception as e:\n", - " print(e)" + " print(e)" ] } ], "metadata": { "colab": { - "provenance": [], + "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", "toc_visible": true }, "environment": { diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index cb2d983ab2..7db11bfc2d 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -138,21 +138,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 22383, - "status": "ok", - "timestamp": 1734467377714, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tFy3H3aPgx12", - "outputId": "14fde80a-429e-4894-d521-704678805ab9" + "id": "tFy3H3aPgx12" }, "outputs": [], "source": [ @@ -180,21 +166,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 252, - "status": "ok", - "timestamp": 1734467630727, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "XRvKdaPDTznN", - "outputId": "f193ca30-cb73-400d-84c3-ce1842144ef0" + "id": "XRvKdaPDTznN" }, "outputs": [], "source": [ @@ -259,22 +231,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - }, - "executionInfo": { - "elapsed": 14164, - "status": "ok", - "timestamp": 1734467672418, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Nqwi-5ufWp_B", - "outputId": 
"649d75fc-01a5-45d4-8e36-f538205e8374" + "id": "Nqwi-5ufWp_B" }, "outputs": [], "source": [ @@ -298,9 +255,14 @@ "\n", "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" ] }, { @@ -325,17 +287,22 @@ "# General\n", "import random\n", "import string\n", - "from IPython.display import display, Markdown, HTML\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", "\n", "# Build agent\n", "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", "from vertexai.preview import reasoning_engines\n", "\n", "# Evaluate agent\n", "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" ] }, { @@ -361,46 +328,67 @@ " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", "\n", + "\n", "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", " display(Markdown(\"### Summary Metrics\"))\n", " display(metrics_df)\n", "\n", " display(Markdown(f\"### Row-wise Metrics\"))\n", " display(eval_result.metrics_table)\n", "\n", + "\n", "def display_drilldown(row: pd.Series) -> None:\n", " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", "\n", " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", "\n", - " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", - " display(HTML(f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", - "\n", - "\n", - " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", - " continue\n", - "\n", - "\n", - " for tool_input_key in predicted_trajectory['tool_input']:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", "\n", - " if tool_input_key in reference_trajectory['tool_input']:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", - " else:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", - " print(\"\\n\")\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", " display(HTML(\"
\"))\n", "\n", + "\n", "def display_dataframe_rows(\n", " df: pd.DataFrame,\n", " columns: list[str] | None = None,\n", " num_rows: int = 3,\n", - " display_drilldown: bool = False\n", + " display_drilldown: bool = False,\n", ") -> None:\n", " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", "\n", @@ -412,13 +400,22 @@ "\n", " for _, row in df.head(num_rows).iterrows():\n", " for column in df.columns:\n", - " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", " display(HTML(f\"{row[column]}
\"))\n", "\n", " display(HTML(\"
\"))\n", "\n", - " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", - " display_drilldown(row)\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", "\n", "def plot_bar_plot(\n", " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", @@ -448,6 +445,7 @@ " fig.update_layout(barmode=\"group\")\n", " fig.show()\n", "\n", + "\n", "def display_radar_plot(eval_results, title: str, metrics=None):\n", " \"\"\"Plot the radar plot.\"\"\"\n", " fig = go.Figure()\n", @@ -473,7 +471,7 @@ " fig.update_layout(\n", " title=title,\n", " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True\n", + " showlegend=True,\n", " )\n", " fig.show()" ] @@ -486,8 +484,7 @@ "source": [ "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", "\n", - "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n", - "\n" + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" ] }, { @@ -520,6 +517,7 @@ " }\n", " return details.get(product_name, \"Product details not found.\")\n", "\n", + "\n", "def get_product_price(product_name: str):\n", " \"\"\"Gathers price about a product.\"\"\"\n", " details = {\n", @@ -580,7 +578,7 @@ "local_1p_agent = reasoning_engines.LangchainAgent(\n", " model=model,\n", " tools=[get_product_details, get_product_price],\n", - " agent_executor_kwargs={\"return_intermediate_steps\": True}\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", ")" ] }, @@ -599,54 +597,24 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 46 - }, - "executionInfo": { - "elapsed": 1940, - "status": "ok", - "timestamp": 1734467973841, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "lGb58OJkjUs9", - "outputId": "2089190f-2428-4dcb-8eb2-5214285344d3" + "id": "lGb58OJkjUs9" }, "outputs": [], "source": [ "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response['output']))" + "display(Markdown(response[\"output\"]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 46 - }, - "executionInfo": { - "elapsed": 1897, - "status": "ok", - "timestamp": 1734468011919, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "INqf60zPWP6L", - "outputId": "71b96760-e68a-47c4-80ad-1536253a0673" + "id": "INqf60zPWP6L" }, "outputs": [], "source": [ "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(response['output']))" + "display(Markdown(response[\"output\"]))" ] }, { @@ -668,21 +636,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 231458, - "status": "ok", - "timestamp": 1734468270309, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "GPNpD676r6T2", - "outputId": "02f04032-a8d6-4f04-936f-22f9b9110875" + "id": "GPNpD676r6T2" }, "outputs": [], "source": [ @@ -713,27 
+667,12 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 46 - }, - "executionInfo": { - "elapsed": 2553, - "status": "ok", - "timestamp": 1734468272840, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "KSCznbhbHRh3", - "outputId": "9a8cb4bc-a266-456a-f4ea-2c6ccecadad8" + "id": "KSCznbhbHRh3" }, "outputs": [], "source": [ "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response['output']))" + "display(Markdown(response[\"output\"]))" ] }, { @@ -789,26 +728,50 @@ "outputs": [], "source": [ "eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", - " ]\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", "}\n", "\n", "eval_sample_dataset = pd.DataFrame(eval_data)" @@ -827,22 +790,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "executionInfo": { - "elapsed": 435, - "status": "ok", - "timestamp": 1734468312746, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "EjsonqWWvIvE", - "outputId": "b989c414-d83a-42ca-f7d9-508e0aaaca9f" + "id": "EjsonqWWvIvE" }, "outputs": [], "source": [ @@ -857,8 +805,7 @@ "source": [ "### Single tool usage evaluation\n", "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", - "\n" + "After you've set your AI agent and the evaluation dataset, you 
start evaluating if the agent is choosing the correct single tool for a given task.\n" ] }, { @@ -882,9 +829,7 @@ }, "outputs": [], "source": [ - "single_tool_usage_metrics = [\n", - " TrajectorySingleToolUse(tool_name='get_product_price')\n", - "]" + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] }, { @@ -895,10 +840,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n", - "\n", - "\n", - "\n" + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" ] }, { @@ -916,33 +858,19 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 613 - }, - "executionInfo": { - "elapsed": 18114, - "status": "ok", - "timestamp": 1734468369661, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "SRv43fDcd5by", - "outputId": "cb875ccf-1ea9-4768-a606-d0e841648850" + "id": "SRv43fDcd5by" }, "outputs": [], "source": [ "single_tool_call_eval_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=remote_1p_agent,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(single_tool_call_eval_result)" ] @@ -962,22 +890,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 829 - }, - "executionInfo": { - "elapsed": 322, - "status": "ok", - "timestamp": 1734468372535, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "1Jopzw83k14w", - "outputId": "3f181e69-b384-4098-a09c-e072473a2dcc" + "id": "1Jopzw83k14w" }, "outputs": [], "source": [ @@ -1027,7 +940,11 @@ "outputs": [], "source": [ "trajectory_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", "]" ] }, @@ -1046,31 +963,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 960 - }, - "executionInfo": { - "elapsed": 35910, - "status": "ok", - "timestamp": 1734468421299, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "vOdS7TJUneHN", - "outputId": "eef7902b-6674-4776-dfe5-7117154cde8d" + "id": "vOdS7TJUneHN" }, 
"outputs": [], "source": [ "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", "\n", "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=trajectory_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", @@ -1093,22 +993,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 393, - "status": "ok", - "timestamp": 1734468421689, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sLVRdN5llA0h", - "outputId": "65586ef4-5845-477e-8c42-ae3277b60a42" + "id": "sLVRdN5llA0h" }, "outputs": [], "source": [ @@ -1119,26 +1004,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 652, - "status": "ok", - "timestamp": 1734468540382, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "PrxM5sMZYXHP", - "outputId": "c343985c-2eac-4fbc-ca75-3fb4ddbeca2b" + "id": "PrxM5sMZYXHP" }, "outputs": [], "source": [ - "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" ] }, { @@ -1162,8 +1036,7 @@ "\n", "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", - "\n" + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] }, { @@ -1174,9 +1047,7 @@ }, "outputs": [], "source": [ - "response_metrics = [\n", - " 'safety', 'coherence'\n", - "]" + "response_metrics = [\"safety\", \"coherence\"]" ] }, { @@ -1194,31 +1065,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 717 - }, - "executionInfo": { - "elapsed": 25771, - "status": "ok", - "timestamp": 1734468573962, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wRb2EC_hknSD", - "outputId": "ab6becbd-0dc3-4bf7-9fbb-9a03a2aa204a" + "id": "wRb2EC_hknSD" }, "outputs": [], "source": [ "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", "\n", "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", @@ -1242,22 +1096,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 397, - "status": "ok", - "timestamp": 1734468577642, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZODTRuq2lF75", - "outputId": "b4e39f6a-e14f-48ef-eb8f-1039316abbee" + "id": "ZODTRuq2lF75" }, "outputs": [], "source": [ @@ -1287,8 +1126,7 @@ "\n", "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", - "\n" + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). 
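The same criteria-plus-rubric pattern generalizes to other custom checks. As a sketch of the pattern only, the hypothetical metric below scores whether a response states a concrete price; the constructor arguments shown (`criteria`, `rating_rubric`, `input_variables`, `metric`, `metric_prompt_template`) mirror the ones used for `response_follows_trajectory` in the cells that follow, but verify them against your installed SDK version.

```python
# Hypothetical second custom metric, shown only to illustrate the pattern.
# Assumes the same PointwiseMetricPromptTemplate / PointwiseMetric API used
# elsewhere in this notebook.
from vertexai.preview.evaluation.metrics import (
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
)

price_criteria = {
    "states_price": (
        "Does the response state a concrete price when the user asked for one?"
    )
}
price_rubric = {
    "1": "States a concrete price",
    "0": "Does not state a concrete price",
}

response_states_price_metric = PointwiseMetric(
    metric="response_states_price",
    metric_prompt_template=PointwiseMetricPromptTemplate(
        criteria=price_criteria,
        rating_rubric=price_rubric,
        input_variables=["prompt", "response"],
    ),
)
```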
Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] }, { @@ -1308,7 +1146,7 @@ " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", " )\n", - " }\n", + "}\n", "\n", "pointwise_rating_rubric = {\n", " \"1\": \"Follows trajectory\",\n", @@ -1335,21 +1173,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 235, - "status": "ok", - "timestamp": 1734468591957, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "5EL7iEDMikNQ", - "outputId": "dbb1f5f1-e0e3-4d77-a29b-93bb6e6b6c54" + "id": "5EL7iEDMikNQ" }, "outputs": [], "source": [ @@ -1362,10 +1186,7 @@ "id": "e1djVp7Fi4Yy" }, "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", - "\n", - "\n", - "\n" + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] }, { @@ -1390,8 +1211,7 @@ "source": [ "#### Set response metrics\n", "\n", - "Set new generated response evaluation metrics by including the custom metric.\n", - "\n" + "Set new generated response evaluation metrics by including the custom metric.\n" ] }, { @@ -1403,7 +1223,10 @@ "outputs": [], "source": [ "response_tool_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", "]" ] }, @@ -1422,22 +1245,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 943 - }, - "executionInfo": { - "elapsed": 33115, - "status": "ok", - "timestamp": 1734468628632, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "_dkb4gSn7Ywv", - "outputId": "2f64adb5-e4a0-4b3b-9841-479e3886ae7f" + "id": "_dkb4gSn7Ywv" }, "outputs": [], "source": [ @@ -1446,7 +1254,7 @@ "response_eval_tool_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", @@ -1469,22 +1277,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 24, - "status": "ok", - "timestamp": 1734468628632, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "GH2YvXgLlLH7", - "outputId": "0a6023db-ecfa-4cc3-d03b-d9f021312a17" + "id": "GH2YvXgLlLH7" }, "outputs": [], "source": [ @@ 
-1499,8 +1292,7 @@ "source": [ "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", - "\n" + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] }, { @@ -1523,41 +1315,94 @@ "outputs": [], "source": [ "byod_eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'generated_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " 
}\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", - "\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", "}\n", "\n", "byod_eval_sample_dataset = pd.DataFrame(eval_data)" @@ -1578,22 +1423,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 960 - }, - "executionInfo": { - "elapsed": 32637, - "status": "ok", - "timestamp": 1734468769433, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wBD-4wpB7q-3", - "outputId": "5ff918de-eadc-40e5-b983-75a668fd1db0" + "id": "wBD-4wpB7q-3" }, "outputs": [], "source": [ @@ -1602,11 +1432,12 @@ "byod_response_eval_tool_task = EvalTask(\n", " dataset=byod_eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=remote_1p_agent,\n", - " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] @@ -1626,22 +1457,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 481, - "status": "ok", - "timestamp": 1734468772628, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "pQFzmd2I7q-3", - "outputId": "cfe0eee2-1705-4b22-d48a-dbca033ada51" + "id": "pQFzmd2I7q-3" }, "outputs": [], "source": [ @@ -1652,26 +1468,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 224, - "status": "ok", - "timestamp": 1734468783403, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "0FEbvEOkZS8f", - "outputId": "15914a55-db88-4626-e837-bc371cfb43d8" + "id": "0FEbvEOkZS8f" }, "outputs": [], "source": [ - "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " 
metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ -1680,53 +1485,38 @@ "id": "2a4e033321ad" }, "source": [ - "## Cleaning up\n", - "\n" + "## Cleaning up\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 5561, - "status": "ok", - "timestamp": 1734337779157, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Ox2I3UfRlTOd", - "outputId": "f2693115-5c89-4710-c823-a80546711732" + "id": "Ox2I3UfRlTOd" }, "outputs": [], "source": [ - "delete_experiment=True\n", - "delete_remote_agent=True\n", + "delete_experiment = True\n", + "delete_remote_agent = True\n", "\n", "if delete_experiment:\n", " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", " except Exception as e:\n", - " print(e)\n", + " print(e)\n", "\n", "if delete_remote_agent:\n", " try:\n", - " remote_1p_agent.delete()\n", + " remote_1p_agent.delete()\n", " except Exception as e:\n", - " print(e)" + " print(e)" ] } ], "metadata": { "colab": { - "provenance": [], + "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", "toc_visible": true }, "environment": { diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index cf7bd60348..fb08f68a52 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -62,11 +62,11 @@ " \"LinkedIn\n", "\n", "\n", - "\n", + "\n", " \"Bluesky\n", "\n", "\n", - "\n", + "\n", " \"X\n", "\n", "\n", @@ -113,8 +113,7 @@ "* Prepare Agent Evaluation dataset\n", "* Single tool usage evaluation\n", "* Trajectory evaluation\n", - "* Response evaluation\n", - "\n" + "* Response evaluation\n" ] }, { @@ -139,21 +138,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 57379, - "status": "ok", - "timestamp": 1734507469619, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "tFy3H3aPgx12", - "outputId": "3b6356f6-3831-4366-e029-2cb3b34b4d2f" + "id": "tFy3H3aPgx12" }, "outputs": [ { @@ -306,21 +291,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 240, - "status": "ok", - "timestamp": 1734507927111, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "XRvKdaPDTznN", - "outputId": "bd462f4a-0cfc-429f-d955-e5df75e94773" + "id": "XRvKdaPDTznN" }, "outputs": [ { @@ -396,22 +367,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 112 - }, - "executionInfo": { - "elapsed": 7562, - "status": "ok", - "timestamp": 1734508011536, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "Nqwi-5ufWp_B", - "outputId": 
"d9d462c6-03cd-4c16-f5e7-55ad485e7dc2" + "id": "Nqwi-5ufWp_B" }, "outputs": [ { @@ -527,9 +483,14 @@ "\n", "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI, experiment=EXPERIMENT_NAME)" + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" ] }, { @@ -554,22 +515,24 @@ "# General\n", "import random\n", "import string\n", - "import json\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from IPython.display import display, Markdown, HTML\n", - "from typing import Callable, Sequence, TypedDict, Annotated, Literal\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", "\n", "# Build agent\n", - "from crewai import Agent, Task, Crew, Process\n", "from crewai_tools import tool\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from vertexai.preview import reasoning_engines\n", "\n", "# Evaluate agent\n", "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import TrajectorySingleToolUse, PointwiseMetricPromptTemplate, PointwiseMetric" + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" ] }, { @@ -595,86 +558,106 @@ " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", "\n", + "\n", "def parse_crewai_output_to_dictionary(crew, crew_output):\n", " \"\"\"\n", " Parse CrewAI output into a structured dictionary format.\n", " \"\"\"\n", - " final_output = {\n", - " 'response': str(crew_output),\n", - " 'predicted_trajectory': []\n", - " }\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", "\n", " try:\n", " # Access tools_results directly from each agent\n", " for agent in crew.agents:\n", - " if hasattr(agent, 'tools_results'):\n", + " if hasattr(agent, \"tools_results\"):\n", " for tool_result in agent.tools_results:\n", " tool_info = {\n", - " 'tool_name': tool_result.get('tool_name', ''),\n", - " 'tool_input': tool_result.get('tool_args', {})\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", " }\n", - " final_output['predicted_trajectory'].append(tool_info)\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", "\n", " except Exception as e:\n", - " final_output['error'] = f\"Error parsing tools results: {str(e)}\"\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", "\n", " return final_output\n", "\n", + "\n", "def format_output_as_markdown(output: dict) -> str:\n", " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", " markdown = \"### AI Response\\n\"\n", " markdown += f\"{output['response']}\\n\\n\"\n", "\n", - " if output['predicted_trajectory']:\n", + " if output[\"predicted_trajectory\"]:\n", " markdown += \"### Function Calls\\n\"\n", - " for call in 
output['predicted_trajectory']:\n", + " for call in output[\"predicted_trajectory\"]:\n", " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call['tool_input'].items():\n", + " for key, value in call[\"tool_input\"].items():\n", " markdown += f\" - `{key}`: `{value}`\\n\"\n", "\n", " return markdown\n", "\n", + "\n", "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient='index').T\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", " display(Markdown(\"### Summary Metrics\"))\n", " display(metrics_df)\n", "\n", " display(Markdown(f\"### Row-wise Metrics\"))\n", " display(eval_result.metrics_table)\n", "\n", + "\n", "def display_drilldown(row: pd.Series) -> None:\n", " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", "\n", " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", "\n", - " if not (isinstance(row['predicted_trajectory'], list) and isinstance(row['reference_trajectory'], list)):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(row['predicted_trajectory'], row['reference_trajectory']):\n", - " display(HTML(f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"))\n", - "\n", - "\n", - " if not (isinstance(predicted_trajectory.get('tool_input'), dict) and isinstance(reference_trajectory.get('tool_input'), dict)):\n", - " continue\n", - "\n", - "\n", - " for tool_input_key in predicted_trajectory['tool_input']:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", "\n", - " if tool_input_key in reference_trajectory['tool_input']:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], reference_trajectory['tool_input'][tool_input_key])\n", - " else:\n", - " print(\"Tool Values: \", predicted_trajectory['tool_input'][tool_input_key], \"N/A\")\n", - " print(\"\\n\")\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", " display(HTML(\"
\"))\n", "\n", + "\n", "def display_dataframe_rows(\n", " df: pd.DataFrame,\n", " columns: list[str] | None = None,\n", " num_rows: int = 3,\n", - " display_drilldown: bool = False\n", + " display_drilldown: bool = False,\n", ") -> None:\n", " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", "\n", @@ -686,13 +669,22 @@ "\n", " for _, row in df.head(num_rows).iterrows():\n", " for column in df.columns:\n", - " display(HTML(f\"{column.replace('_', ' ').title()}: \"))\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", " display(HTML(f\"{row[column]}
\"))\n", "\n", " display(HTML(\"
\"))\n", "\n", - " if display_drilldown and 'predicted_trajectory' in df.columns and 'reference_trajectory' in df.columns:\n", - " display_drilldown(row)\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", "\n", "def plot_bar_plot(\n", " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", @@ -722,6 +714,7 @@ " fig.update_layout(barmode=\"group\")\n", " fig.show()\n", "\n", + "\n", "def display_radar_plot(eval_results, title: str, metrics=None):\n", " \"\"\"Plot the radar plot.\"\"\"\n", " fig = go.Figure()\n", @@ -747,7 +740,7 @@ " fig.update_layout(\n", " title=title,\n", " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True\n", + " showlegend=True,\n", " )\n", " fig.show()" ] @@ -760,8 +753,7 @@ "source": [ "## Build an agent using Vertex AI Reasoning Engine's customized template\n", "\n", - "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n", - "\n" + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" ] }, { @@ -828,7 +820,9 @@ }, "outputs": [], "source": [ - "def router(state: list[BaseMessage]) -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", " # Get the tool_calls from the last message in the conversation history.\n", " tool_calls = state[-1].tool_calls\n", @@ -836,7 +830,7 @@ " # If there are any tool_calls\n", " if tool_calls:\n", " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get('name')\n", + " function_name = tool_calls[0].get(\"name\")\n", " if function_name == \"get_product_price\":\n", " return \"get_product_price\"\n", " else:\n", @@ -950,22 +944,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 1688, - "status": "ok", - "timestamp": 1734506144463, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "PgkOhPmN3aCZ", - "outputId": "dfba408f-6510-4eb8-acd2-e845e360a6ef" + "id": "PgkOhPmN3aCZ" }, "outputs": [ { @@ -989,29 +968,16 @@ ], "source": [ "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 977, - "status": "ok", - "timestamp": 1734506145439, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "lGb58OJkjUs9", - "outputId": "ca9ea3b9-7141-43e0-b169-fb30bd028509" + "id": "lGb58OJkjUs9" }, "outputs": [ { @@ -1035,7 +1001,9 @@ ], "source": [ "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - 
"display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" ] }, { @@ -1057,21 +1025,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 239473, - "status": "ok", - "timestamp": 1734472951301, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "3HLz_a1We4QE", - "outputId": "e95d4768-a790-4910-c172-94f90bb0a8bd" + "id": "3HLz_a1We4QE" }, "outputs": [ { @@ -1122,22 +1076,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "executionInfo": { - "elapsed": 1834, - "status": "ok", - "timestamp": 1734506153310, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sqBtzYyce4QE", - "outputId": "16f3296b-9dd4-404a-8453-00fd799118ff" + "id": "sqBtzYyce4QE" }, "outputs": [ { @@ -1161,7 +1100,9 @@ ], "source": [ "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response))))" + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" ] }, { @@ -1218,26 +1159,50 @@ "outputs": [], "source": [ "eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", - " ]\n", + " ],\n", "}\n", 
"\n", "eval_sample_dataset = pd.DataFrame(eval_data)" @@ -1256,22 +1221,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 273 - }, - "executionInfo": { - "elapsed": 388, - "status": "ok", - "timestamp": 1734506157571, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "EjsonqWWvIvE", - "outputId": "0dbf8eec-171c-4fa2-943f-6e4e936a0d94" + "id": "EjsonqWWvIvE" }, "outputs": [ { @@ -1479,14 +1429,14 @@ "outputs": [], "source": [ "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", "\n", - " result = remote_custom_agent.query(input=input)\n", + " result = remote_custom_agent.query(input=input)\n", "\n", - " # Parse function calls separately\n", - " agent_output = parse_messages_to_output_dictionary(result)\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", "\n", - " return agent_output" + " return agent_output" ] }, { @@ -1497,8 +1447,7 @@ "source": [ "### Single tool usage evaluation\n", "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n", - "\n" + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" ] }, { @@ -1522,9 +1471,7 @@ }, "outputs": [], "source": [ - "single_tool_usage_metrics = [\n", - " TrajectorySingleToolUse(tool_name='get_product_price')\n", - "]" + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] }, { @@ -1535,10 +1482,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n", - "\n", - "\n", - "\n" + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" ] }, { @@ -1556,22 +1500,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 700 - }, - "executionInfo": { - "elapsed": 15594, - "status": "ok", - "timestamp": 1734506517045, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "SRv43fDcd5by", - "outputId": "65edaf55-79c4-404c-d5ab-a2e75898e9b8" + "id": "SRv43fDcd5by" }, "outputs": [ { @@ -2352,11 +2281,12 @@ "single_tool_call_eval_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(runnable=agent_parsed_response,\n", - " experiment_run_name=EXPERIMENT_RUN)\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", "\n", "display_eval_report(single_tool_call_eval_result)" ] @@ -2376,22 +2306,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 794 - }, - "executionInfo": { - "elapsed": 242, - "status": "ok", - "timestamp": 1734506577360, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZkpwPReipekr", - "outputId": "6e8961d7-a66e-49e4-fad6-4d637fdf6dd0" + "id": "ZkpwPReipekr" }, "outputs": [ { @@ -2982,7 +2897,11 @@ "outputs": [], "source": [ "trajectory_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"trajectory_any_order_match\", \"trajectory_precision\", \"trajectory_recall\"\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", "]" ] }, @@ -3001,22 +2920,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 279 - }, - "executionInfo": { - "elapsed": 32287, - "status": "ok", - "timestamp": 1734506644979, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "vOdS7TJUneHN", - "outputId": "d5df1ad4-5ff6-4bec-d15f-eeac5921c601" + "id": "vOdS7TJUneHN" }, "outputs": [ { @@ -3286,12 +3190,10 @@ } ], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", "\n", "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=trajectory_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", @@ -3314,22 +3216,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 483, - "status": "ok", - "timestamp": 1734506658651, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "sLVRdN5llA0h", - "outputId": 
"457f3216-2323-4bdf-eda5-7a22b88ca54f" + "id": "sLVRdN5llA0h" }, "outputs": [ { @@ -4169,22 +4056,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 244, - "status": "ok", - "timestamp": 1734506659132, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "PrxM5sMZYXHP", - "outputId": "40cbb7f1-eb52-4fd7-af16-14897a629f5f" + "id": "PrxM5sMZYXHP" }, "outputs": [ { @@ -4228,7 +4100,11 @@ } ], "source": [ - "plot_bar_plot(trajectory_eval_result, title=\"Trajectory Metrics\", metrics=[f'{metric}/mean' for metric in trajectory_metrics])" + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" ] }, { @@ -4252,8 +4128,7 @@ "\n", "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n", - "\n" + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] }, { @@ -4264,9 +4139,7 @@ }, "outputs": [], "source": [ - "response_metrics = [\n", - " 'safety', 'coherence'\n", - "]" + "response_metrics = [\"safety\", \"coherence\"]" ] }, { @@ -4284,22 +4157,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 279 - }, - "executionInfo": { - "elapsed": 20843, - "status": "ok", - "timestamp": 1734506685051, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wRb2EC_hknSD", - "outputId": "6207f313-1040-418a-c506-fcae56b4c170" + "id": "wRb2EC_hknSD" }, "outputs": [ { @@ -4569,12 +4427,10 @@ } ], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", "\n", "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", ")\n", "\n", "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", @@ -4598,22 +4454,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 240, - "status": "ok", - "timestamp": 1734506703538, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "cy0aRydrp9zW", - "outputId": "325bda1f-a89e-4117-85fe-b7d452b4da87" + "id": "cy0aRydrp9zW" }, "outputs": [ { @@ -5410,8 +5251,7 @@ "\n", "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", "\n", - "Define a `criteria` to set the 
evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n", - "\n" + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] }, { @@ -5431,7 +5271,7 @@ " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", " )\n", - " }\n", + "}\n", "\n", "pointwise_rating_rubric = {\n", " \"1\": \"Follows trajectory\",\n", @@ -5458,21 +5298,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "executionInfo": { - "elapsed": 435, - "status": "ok", - "timestamp": 1734506717333, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "5EL7iEDMikNQ", - "outputId": "8a7fc362-3449-426a-a244-5fd380d219af" + "id": "5EL7iEDMikNQ" }, "outputs": [ { @@ -5528,10 +5354,7 @@ "id": "e1djVp7Fi4Yy" }, "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n", - "\n", - "\n", - "\n" + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] }, { @@ -5556,8 +5379,7 @@ "source": [ "#### Set response metrics\n", "\n", - "Set new generated response evaluation metrics by including the custom metric.\n", - "\n" + "Set new generated response evaluation metrics by including the custom metric.\n" ] }, { @@ -5569,7 +5391,10 @@ "outputs": [], "source": [ "response_tool_metrics = [\n", - " \"trajectory_exact_match\", \"trajectory_in_order_match\", \"safety\", response_follows_trajectory_metric\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", "]" ] }, @@ -5588,22 +5413,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 891 - }, - "executionInfo": { - "elapsed": 28503, - "status": "ok", - "timestamp": 1734506756916, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "_dkb4gSn7Ywv", - "outputId": "5f647c02-7e90-433c-d4d6-910ea18b7133" + "id": "_dkb4gSn7Ywv" }, "outputs": [ { @@ -6453,15 +6263,17 @@ } ], "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", "\n", "response_eval_tool_task = EvalTask(\n", " dataset=eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "response_eval_tool_result = 
response_eval_tool_task.evaluate(runnable=agent_parsed_response)\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", "\n", "display_eval_report(response_eval_tool_result)" ] @@ -6482,22 +6294,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 239, - "status": "ok", - "timestamp": 1734506757152, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "ZODTRuq2lF75", - "outputId": "7dd44083-885d-4811-89b9-25abc88e95de" + "id": "ZODTRuq2lF75" }, "outputs": [ { @@ -7279,8 +7076,7 @@ "source": [ "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n", - "\n" + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] }, { @@ -7303,41 +7099,94 @@ "outputs": [], "source": [ "byod_eval_data = {\n", - " 'prompt': [\n", + " \"prompt\": [\n", " \"Get price for smartphone\",\n", " \"Get product details and price for headphones\",\n", " \"Get details for usb charger\",\n", " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\"\n", + " \"Get product details for speaker?\",\n", " ],\n", - " 'reference_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'generated_trajectory': [\n", - " [{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}],\n", + " \"generated_trajectory\": [\n", + " [\n", + " 
{\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}],\n", " [\n", - " {'tool_name': 'get_product_details', 'tool_input': {'product_name': 'shoes'}},\n", - " {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'shoes'}}\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", " ],\n", - " [{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'speaker'}}]\n", " ],\n", - " 'response': [500, 50, 'A super fast and light usb charger', 100, 'A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.']\n", - "\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", "}\n", "\n", "byod_eval_sample_dataset = pd.DataFrame(eval_data)" @@ -7358,22 +7207,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 891 - }, - "executionInfo": { - "elapsed": 30095, - "status": "ok", - "timestamp": 1734506845575, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "wBD-4wpB7q-3", - "outputId": "a0db90f4-0a90-4635-ee18-3a8479a71658" + "id": "wBD-4wpB7q-3" }, "outputs": [ { @@ -8228,11 +8062,12 @@ "byod_response_eval_tool_task = EvalTask(\n", " dataset=byod_eval_sample_dataset,\n", " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME\n", + " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(runnable=agent_parsed_response,\n", - " experiment_run_name=EXPERIMENT_RUN_NAME)\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] @@ -8252,22 +8087,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "executionInfo": { - "elapsed": 253, - "status": "ok", - "timestamp": 1734506845825, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "pQFzmd2I7q-3", - "outputId": "24e51c3b-e104-471d-e15f-e63d925d0fd7" + "id": "pQFzmd2I7q-3" }, "outputs": [ { @@ -9179,33 +8999,7 @@ 
"cell_type": "code", "execution_count": null, "metadata": { - "id": "pQUZRGb3rLC0" - }, - "outputs": [], - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "executionInfo": { - "elapsed": 2, - "status": "ok", - "timestamp": 1734507187235, - "user": { - "displayName": "Ivan Nardini", - "userId": "04192340647469915671" - }, - "user_tz": -120 - }, - "id": "0FEbvEOkZS8f", - "outputId": "94a95394-05fe-47f4-ce9c-301ce311bcf5" + "id": "0FEbvEOkZS8f" }, "outputs": [ { @@ -9249,7 +9043,11 @@ } ], "source": [ - "display_radar_plot(byod_response_eval_tool_result, title=\"Response Metrics\", metrics=[f'{metric}/mean' for metric in response_tool_metrics])" + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" ] }, { @@ -9258,8 +9056,7 @@ "id": "2a4e033321ad" }, "source": [ - "## Cleaning up\n", - "\n" + "## Cleaning up\n" ] }, { @@ -9270,27 +9067,27 @@ }, "outputs": [], "source": [ - "delete_experiment=True\n", - "delete_remote_agent=True\n", + "delete_experiment = True\n", + "delete_remote_agent = True\n", "\n", "if delete_experiment:\n", " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", " except Exception as e:\n", - " print(e)\n", + " print(e)\n", "\n", "if delete_remote_agent:\n", " try:\n", - " remote_custom_agent.delete()\n", + " remote_custom_agent.delete()\n", " except Exception as e:\n", - " print(e)" + " print(e)" ] } ], "metadata": { "colab": { - "provenance": [], + "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", "toc_visible": true }, "environment": { From 239ed6722792da933d82c8aee6533d947a36a859 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:02:01 +0000 Subject: [PATCH 03/14] nox passed --- .../evaluation/evaluating_crewai_agent.ipynb | 3161 ++- .../evaluating_langgraph_agent.ipynb | 3137 ++- ...reasoning_engine_customized_template.ipynb | 3407 ++- ...t_reasoning_engine_prebuilt_template.ipynb | 3073 ++- ...reasoning_engine_customized_template.ipynb | 17895 ++++++++-------- 5 files changed, 15289 insertions(+), 15384 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index c588b87a11..56700ff712 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -1,1592 +1,1573 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the 
License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "
\n", - " \n", + " \n", " \"Google
Open in Colab\n", "
\n", "
\n", - " \n", + " \n", " \"Google
Open in Colab Enterprise\n", "
\n", "
\n", - " \n", + " \n", " \"Vertex
Open in Vertex AI Workbench\n", "
\n", "
\n", - " \n", + " \n", " \"GitHub
View on GitHub\n", "
\n", "
\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using Crew AI\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "import warnings\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
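The sketch below illustrates how these helpers are typically chained once the agent and an evaluation run exist; `crew`, `crew_output`, and `eval_result` are placeholders for objects created in later cells of this tutorial.

```python
# Illustrative sketch: `crew`, `crew_output`, and `eval_result` are assumed to be
# produced by the agent-building and EvalTask cells later in this notebook.
parsed = parse_crewai_output_to_dictionary(crew, crew_output)
display(Markdown(format_output_as_markdown(parsed)))  # inspect one agent response

display_eval_report(eval_result)  # summary metrics plus the row-wise metrics table
display_dataframe_rows(eval_result.metrics_table, num_rows=3)
plot_bar_plot(eval_result, title="Summary metrics")
```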
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build CrewAI agent\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " # Create task based on the input\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " # Create crew with sequential process\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z7-LdM3mLBtk" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tdVhCURXMdLG" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": 
\"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "84HiPDOkPseW" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the 
License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using Crew AI\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import warnings\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build CrewAI agent\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers the price of a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct the conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to the appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and it also lets you bring your own custom function with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parses the agent outcome to extract the response and the called tools."
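, + "\n", + "Concretely, the only structure such a custom function is assumed to need (inferred from the parsing helper and the dataset columns used in this notebook, not an official interface) is: take the user prompt and return a dictionary with a `response` string and a `predicted_trajectory` list. A minimal sketch, using an illustrative function name:\n", + "\n", + "```python\n", + "def my_runnable(prompt: str) -> dict:\n", + "    # Run your agent here, then report what it answered and which tools it called.\n", + "    return {\n", + "        \"response\": \"final answer text\",\n", + "        \"predicted_trajectory\": [\n", + "            {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + "        ],\n", + "    }\n", + "```"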
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " # Create task based on the input\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " # Create crew with sequential process\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z7-LdM3mLBtk" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
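Beyond the aggregate report, it is often useful to drill into the rows the judge model scored lowest on the custom metric. The snippet below is a small sketch that assumes the row-wise table exposes a `response_follows_trajectory/score` column; column names can vary across SDK versions, hence the guard.

```python
# Inspect the examples that failed the custom "response follows trajectory" check.
# The score column name is an assumption; adjust it if your SDK version differs.
metrics_df = response_eval_tool_result.metrics_table
score_col = "response_follows_trajectory/score"

if score_col in metrics_df.columns:
    failing_rows = metrics_df[metrics_df[score_col] < 1]
    print(f"{len(failing_rows)} of {len(metrics_df)} responses did not follow their trajectory.")
    display_dataframe_rows(failing_rows, num_rows=3)
else:
    print(f"Column '{score_col}' not found. Available columns: {list(metrics_df.columns)}")
```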
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tdVhCURXMdLG" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": 
\"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "84HiPDOkPseW" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index c17a6ba09e..cb3ee72425 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -1,1580 +1,1561 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or 
agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate an LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using LangGraph\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from langchain.load import dump as langchain_load_dump\n", - "\n", - "# Build agent\n", - "from langchain_core.messages import BaseMessage, HumanMessage\n", - "from langchain_core.tools import tool\n", - "from langchain_google_vertexai import ChatVertexAI\n", - "from langgraph.graph import END, MessageGraph\n", - "from langgraph.prebuilt import ToolNode\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", - " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", - "\n", - " final_output = {\n", - " \"response\": \"No AI response found in the message history.\",\n", - " \"predicted_trajectory\": [],\n", - " }\n", - "\n", - " # Process each message\n", - " function_calls = []\n", - " for message in messages:\n", - " # Check if it's a Tool message which contains the actual response\n", - " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", - "\n", - " # Check if it's an AI message to get tool calls\n", - " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", - " for tool_call in tool_calls:\n", - " if tool_call:\n", - " function_calls.append(\n", - " {\n", - " \"tool_name\": tool_call.get(\"name\"),\n", - " \"tool_input\": tool_call.get(\"args\"),\n", - " }\n", - " )\n", - "\n", - " final_output[\"predicted_trajectory\"] = function_calls\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build LangGraph agent\n", - "\n", - "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
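Before compiling the graph in the next cell, you can optionally sanity-check the router in isolation. This is an illustrative snippet only: the `AIMessage` objects are hand-built stand-ins for what the model node would normally produce.

```python
from langchain_core.messages import AIMessage

# Hand-built messages that mimic the model node's output (illustrative only).
price_call = AIMessage(
    content="",
    tool_calls=[
        {"name": "get_product_price", "args": {"product_name": "shoes"}, "id": "call-1"}
    ],
)
no_tool_call = AIMessage(content="Hello! How can I help?")

print(router([price_call]))    # expected: "get_product_price"
print(router([no_tool_call]))  # expected: "__end__"
```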
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - "\n", - " model = ChatVertexAI(model=model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - "\n", - " app = builder.compile()\n", - " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", - " return parse_messages_to_output_dictionary(chat_history)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
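To build intuition for the match-style metrics above, the snippet below re-implements their logic in plain Python. It is illustrative only and not the Gen AI Evaluation service's implementation; the function names are made up for this example, and tool calls are compared by exact name and input.

```python
def exact_match(predicted: list[dict], reference: list[dict]) -> bool:
    """Same tool calls, same order."""
    return predicted == reference


def in_order_match(predicted: list[dict], reference: list[dict]) -> bool:
    """All reference calls appear in predicted, in the same relative order (extras allowed)."""
    position = 0
    for call in predicted:
        if position < len(reference) and call == reference[position]:
            position += 1
    return position == len(reference)


def any_order_match(predicted: list[dict], reference: list[dict]) -> bool:
    """All reference calls appear somewhere in predicted (order and extras ignored)."""
    return all(ref in predicted for ref in reference)


# Example: an extra detail lookup happens before the expected price lookup.
reference = [{"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}}]
predicted = [
    {"tool_name": "get_product_details", "tool_input": {"product_name": "shoes"}},
    {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
]
print(exact_match(predicted, reference))      # False
print(in_order_match(predicted, reference))   # True
print(any_order_match(predicted, reference))  # True
```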
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "erYYZEaaTNjJ" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WOP9hW-rTUIU" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
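Since the notebook has now produced several evaluation runs, it can also be handy to line up their summary metrics side by side before looking at row-level results. This is a small sketch that assumes the result objects from the runs above are still in memory; metrics that were not part of a given run show up as NaN.

```python
# Combine summary metrics from the runs above into one comparison table.
summary_by_run = {
    "single_tool": single_tool_call_eval_result.summary_metrics,
    "trajectory": trajectory_eval_result.summary_metrics,
    "response": response_eval_result.summary_metrics,
    "response_over_tools": response_eval_tool_result.summary_metrics,
}

comparison_df = pd.DataFrame(summary_by_run)  # rows: metrics, columns: runs
display(comparison_df)
```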
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DJr8GqQKTpUa" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using LangGraph\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", + "# Build agent\n", + "from langchain_core.messages import BaseMessage, HumanMessage\n", + "from langchain_core.tools import tool\n", + "from langchain_google_vertexai import ChatVertexAI\n", + "from langgraph.graph import END, MessageGraph\n", + "from langgraph.prebuilt import ToolNode\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", + " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", + "\n", + " final_output = {\n", + " \"response\": \"No AI response found in the message history.\",\n", + " \"predicted_trajectory\": [],\n", + " }\n", + "\n", + " # Process each message\n", + " function_calls = []\n", + " for message in messages:\n", + " # Check if it's a Tool message which contains the actual response\n", + " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", + "\n", + " # Check if it's an AI message to get tool calls\n", + " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", + " for tool_call in tool_calls:\n", + " if tool_call:\n", + " function_calls.append(\n", + " {\n", + " \"tool_name\": tool_call.get(\"name\"),\n", + " \"tool_input\": tool_call.get(\"args\"),\n", + " }\n", + " )\n", + "\n", + " final_output[\"predicted_trajectory\"] = function_calls\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build LangGraph agent\n", + "\n", + "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + "\n", + " llm = ChatVertexAI(model=model) # use the Gemini model name set above without shadowing it\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = llm.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + "\n", + " app = builder.compile()\n", + " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", + " return parse_messages_to_output_dictionary(chat_history)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset, depending on which aspects of your agent you want to evaluate.\n", + "\n", + "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the `agent_parsed_outcome` function and assign a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of the evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted.\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall`, which range from 0 to 1.\n",
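+ "\n", + "For instance, by these definitions, if the reference trajectory is `[get_product_details, get_product_price]` and the predicted trajectory is `[get_product_price, get_product_details]`, then `trajectory_exact_match` and `trajectory_in_order_match` score 0 (the order differs), while `trajectory_any_order_match`, `trajectory_precision`, and `trajectory_recall` all score 1."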
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "erYYZEaaTNjJ" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate the agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOP9hW-rTUIU" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent evaluation using your own dataset and the same settings as the previous evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DJr8GqQKTpUa" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 645877010d..10be4b26fb 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -1,1715 +1,1696 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not 
(\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class CrewAIApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " \"\"\"Set up the application.\"\"\"\n", - " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", - " return\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[reasoningengine]\",\n", - " \"crewai\",\n", - " \"crewai-tools\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
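To make the ground-truth trajectory metrics concrete before running them, here is a small, purely illustrative computation of `trajectory_precision` and `trajectory_recall` on a toy predicted/reference pair. This is only a sketch of the definitions above, not the implementation used by Vertex AI Gen AI Evaluation, and it treats each action as a `(tool_name, tool_input)` pair:

```python
# Illustrative only: toy trajectory precision/recall, where an action is a
# (tool_name, frozen tool_input) pair. Not the service's actual implementation.
def _actions(trajectory):
    return [(s["tool_name"], tuple(sorted(s["tool_input"].items()))) for s in trajectory]


def toy_trajectory_precision(predicted, reference):
    pred, ref = _actions(predicted), _actions(reference)
    return sum(a in ref for a in pred) / len(pred) if pred else 0.0


def toy_trajectory_recall(predicted, reference):
    pred, ref = _actions(predicted), _actions(reference)
    return sum(a in pred for a in ref) / len(ref) if ref else 0.0


predicted = [
    {"tool_name": "get_product_details", "tool_input": {"product_name": "shoes"}},
    {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
]
reference = [
    {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
]

print(toy_trajectory_precision(predicted, reference))  # 0.5: one of two predicted actions is in the reference
print(toy_trajectory_recall(predicted, reference))  # 1.0: the single reference action was predicted
```

On this toy pair, `trajectory_exact_match` would be 0 (the prediction contains an extra action), while `trajectory_in_order_match` and `trajectory_any_order_match` would both be 1, because the single reference action does appear in the prediction.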
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# 
https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not 
(\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
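The model identifier set in the next cell follows the `vertex_ai/<model-id>` naming convention that CrewAI (through LiteLLM) uses to route requests to Vertex AI; treat that convention as an assumption to verify against the CrewAI documentation for your installed version. Switching to another Gemini variant only requires changing the string, for example:

```python
# Hypothetical alternative: a lighter-weight Gemini model, using the same
# "vertex_ai/<model-id>" convention assumed above.
model = "vertex_ai/gemini-1.5-flash-002"
```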
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class CrewAIApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " \"\"\"Set up the application.\"\"\"\n", + " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", + " return\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
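+ "\n", + "\n", + "The calls below return the dictionary produced by the `parse_crewai_output_to_dictionary` helper defined earlier. The evaluation sections later in this notebook assume that this dictionary exposes the agent's final answer together with the tool calls it made, roughly in the following shape (illustrative values only, shown as an assumption rather than guaranteed helper output):\n", + "\n", + "```python\n", + "{\n", + "    \"response\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + "    \"predicted_trajectory\": [\n", + "        {\"tool_name\": \"get_product_details\", \"tool_input\": {\"product_name\": \"shoes\"}}\n", + "    ],\n", + "}\n", + "```"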
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, use the `create` method, passing the agent and specifying its dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages).\n", + "\n", + "See the [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more.\n", + "\n", + "> Deploying the agent to Vertex AI Reasoning Engine takes roughly 10 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + "    local_custom_agent,\n", + "    requirements=[\n", + "        \"google-cloud-aiplatform[reasoningengine]\",\n", + "        \"crewai\",\n", + "        \"crewai-tools\",\n", + "        \"cloudpickle==3.0.0\",\n", + "        \"pydantic==2.7.4\",\n", + "        \"requests\",\n", + "    ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation through the agent function (`agent_parsed_response`) and assign a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset,\n", + "    metrics=single_tool_usage_metrics,\n", + "    experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + "    runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of the evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also uses them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order and extras don't matter)\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1."
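+ "\n", + "\n", + "As a quick sanity check of these definitions (an illustrative reading of the metric descriptions above, not the service's exact implementation): if the reference trajectory is `[get_product_details, get_product_price]` and the agent only calls `get_product_details`, then `trajectory_precision` is 1.0 (the single predicted call appears in the reference), `trajectory_recall` is 0.5 (only one of the two reference calls was made), and the exact, in-order, and any-order match metrics are all 0. A minimal sketch of that intuition, using hypothetical tool-name lists:\n", + "\n", + "```python\n", + "reference = [\"get_product_details\", \"get_product_price\"]\n", + "predicted = [\"get_product_details\"]\n", + "\n", + "precision = sum(p in reference for p in predicted) / len(predicted)  # 1.0\n", + "recall = sum(r in predicted for r in reference) / len(reference)  # 0.5\n", + "exact_match = float(predicted == reference)  # 0.0\n", + "```"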
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + "    \"trajectory_exact_match\",\n", + "    \"trajectory_in_order_match\",\n", + "    \"trajectory_any_order_match\",\n", + "    \"trajectory_precision\",\n", + "    \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + "    trajectory_eval_result,\n", + "    title=\"Trajectory Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 7db11bfc2d..3ba858d797 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -1,1548 +1,1529 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - 
"#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating an LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangChain\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "\n", - "# Evaluate agent\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", - "\n", - "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", - "\n", - "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "local_1p_agent = reasoning_engines.LangchainAgent(\n", - " model=model,\n", - " tools=[get_product_details, get_product_price],\n", - " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "INqf60zPWP6L" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dP5g16W1rzMI" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GPNpD676r6T2" - }, - "outputs": [], - "source": [ - "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_1p_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjZMd82vHRh3" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KSCznbhbHRh3" - }, - "outputs": [], - "source": [ - "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing.\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_1p_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + 
"#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating an LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangChain\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", + "\n", + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", + "\n", + "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "local_1p_agent = reasoning_engines.LangchainAgent(\n", + " model=model,\n", + " tools=[get_product_details, get_product_price],\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INqf60zPWP6L" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dP5g16W1rzMI" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, use the `create` method, passing the agent and its required dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages).\n", + "\n", + "See the [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more.\n", + "\n", + "> Deploying the agent to Vertex AI Reasoning Engine takes about 10 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GPNpD676r6T2" + }, + "outputs": [], + "source": [ + "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", + "    local_1p_agent,\n", + "    requirements=[\n", + "        \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + "        \"langchain_google_vertexai\",\n", + "        \"cloudpickle==3.0.0\",\n", + "        \"pydantic==2.7.4\",\n", + "        \"requests\",\n", + "    ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjZMd82vHRh3" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KSCznbhbHRh3" + }, + "outputs": [], + "source": [ + "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of how well they are performing. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it to production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement."
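The latency and failure-rate signals above are not computed by the evaluation metrics used in this tutorial, but you can get a quick read on them by timing a few direct queries against the deployed agent. The snippet below is a minimal, illustrative sketch (plain Python, not part of the Vertex AI SDK); it only reuses the `remote_1p_agent.query()` call shown earlier, and the prompts are example values.

```python
import time

# Example prompts; any user requests your agent supports would work here.
test_prompts = [
    "Get product details for shoes",
    "Get product price for shoes",
]

latencies, failures = [], 0
for prompt in test_prompts:
    start = time.perf_counter()
    try:
        remote_1p_agent.query(input=prompt)
        latencies.append(time.perf_counter() - start)
    except Exception as e:  # any failed call counts toward the failure rate
        failures += 1
        print(f"Query failed for {prompt!r}: {e}")

if latencies:
    print(f"Average latency: {sum(latencies) / len(latencies):.2f}s")
print(f"Failure rate: {failures / len(test_prompts):.0%}")
```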
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating whether the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assign a unique identifier to this specific evaluation run, so the results are stored and can be visualized.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset,\n", + "    metrics=single_tool_usage_metrics,\n", + "    experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + "    runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order."
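To build intuition for the ground-truth trajectory metrics configured in the next section, it can help to see how such comparisons behave on the trajectory format used in this tutorial (lists of `tool_name`/`tool_input` dicts). The helpers below are an illustrative sketch only, not part of the Vertex AI SDK; the managed metrics themselves are computed by the Gen AI Evaluation service.

```python
# Illustrative sketch (assumed helpers, not part of the Vertex AI SDK).
def exact_match(predicted: list[dict], reference: list[dict]) -> bool:
    """Same tool calls, in the same order."""
    return predicted == reference


def any_order_match(predicted: list[dict], reference: list[dict]) -> bool:
    """Every reference call appears somewhere in the predicted trajectory."""
    return all(step in predicted for step in reference)


reference = [
    {"tool_name": "get_product_details", "tool_input": {"product_name": "shoes"}},
    {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
]
predicted = list(reversed(reference))  # same tools, different order

print(exact_match(predicted, reference))      # False: order matters
print(any_order_match(predicted, reference))  # True: all reference calls are present
```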
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth-based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in the predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in the predicted trajectory (order and extras don't matter)\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in the reference trajectory\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in the predicted trajectory\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall`, which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + "    \"trajectory_exact_match\",\n", + "    \"trajectory_in_order_match\",\n", + "    \"trajectory_any_order_match\",\n", + "    \"trajectory_precision\",\n", + "    \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + "    trajectory_eval_result,\n", + "    title=\"Trajectory Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print a sample of the new evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated responses conditioned on tool choices\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tool choices, like the one in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + "    \"Follows trajectory\": (\n", + "        \"Evaluate whether the agent's response logically follows from the \"\n", + "        \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_1p_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index fb08f68a52..9b8b769671 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -1,9119 +1,9100 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - 
"#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m192.0/192.0 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.0/468.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.8/131.8 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m628.3/628.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.8/147.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.4/211.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m72.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.0/15.0 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.5/233.5 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.6/278.6 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m249.9/249.9 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.6/131.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m327.6/327.6 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.3/44.3 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.7/50.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.8/311.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.2/83.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.8/54.8 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m481.7/481.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m56.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.1/442.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.0/209.0 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.2/267.2 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m57.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m443.8/443.8 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m49.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for docx2txt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for pypika (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[33m WARNING: The script uvicorn is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pytube is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script dotenv is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pypdfium2 is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script nodeenv is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script mako-render is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script json_repair is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script humanfriendly is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script fastavro is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script watchfiles is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The scripts pyright, pyright-langserver, pyright-python and pyright-python-langserver are installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script gptcache_server is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script coloredlogs is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pyproject-build is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this 
directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script alembic is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script onnxruntime_test is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script fastapi is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pdfplumber is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script litellm is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script instructor is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script chroma is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script ec is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script crewai is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", - "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", - "transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating gs://evaluate_agents/...\n", - "ServiceException: 409 A Cloud Storage bucket named 'evaluate_agents' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and 
isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class LangGraphApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " model = ChatVertexAI(model=self.model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - " self.app = builder.compile()\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", - " return chat_history\n", - " # return {'output': parse_messages_to_output_dictionary(chat_history)}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "High-performance running shoes designed for comfort, support, and speed.\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_details`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "100\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_price`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:vertexai.reasoning_engines._reasoning_engines:Using bucket evaluate_agents\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/reasoning_engine.pkl\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/requirements.txt\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Creating in-memory tarfile of extra_packages\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/dependencies.tar.gz\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Creating ReasoningEngine\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Create ReasoningEngine backing LRO: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496/operations/5878089664325222400\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:ReasoningEngine created. 
Resource name: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:To use this ReasoningEngine in another session:\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:reasoning_engine = vertexai.preview.reasoning_engines.ReasoningEngine('projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496')\n" - ] - } - ], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"langgraph\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "High-performance running shoes designed for comfort, support, and speed.\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_details`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. 
\n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " # Parse function calls separately\n", - " agent_output = parse_messages_to_output_dictionary(result)\n", - "\n", - " return agent_output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-agent-single-metric-eval-s58mdw1j to Experiment: evaluate-agent\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.81it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 5 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 5/5 [00:04<00:00, 1.23it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:4.098520709000013 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 2.1747785195999767,\n \"max\": 2.1747785195999767,\n \"num_unique_values\": 1,\n \"samples\": [\n 2.1747785195999767\n 
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5584294262336947,\n \"max\": 0.5584294262336947,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5584294262336947\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_count | trajectory_single_tool_use/mean | trajectory_single_tool_use/std | latency_in_seconds/mean | latency_in_seconds/std | failure/mean | failure/std
0 | 5.0 | 0.6 | 0.547723 | 2.174779 | 0.558429 | 0.0 | 0.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_single_tool_use/mean trajectory_single_tool_use/std \\\n", - "0 5.0 0.6 0.547723 \n", - "\n", - " latency_in_seconds/mean latency_in_seconds/std failure/mean failure/std \n", - "0 2.174779 0.558429 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.4841679319999912,\n \"max\": 2.7480303170000298,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.7480303170000298,\n 2.3126841799999056,\n 2.624170197000012\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_single_tool_use/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4841680[{'tool_name': 'get_product_price', 'tool_inpu...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...502.748030[{'tool_name': 'get_product_details', 'tool_in...1.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.624170[{'tool_name': 'get_product_details', 'tool_in...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.704840[{'tool_name': 'get_product_details', 'tool_in...1.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...2.3126840[{'tool_name': 'get_product_details', 'tool_in...0.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.484168 \n", - "1 50 2.74803 \n", - "2 A super fast and light usb charger 2.62417 \n", - "3 100 1.70484 \n", - "4 A voice-controlled smart speaker that plays mu... 2.312684 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_single_tool_use/score \n", - "0 1.0 \n", - "1 1.0 \n", - "2 0.0 \n", - "3 1.0 \n", - "4 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4841679319999912
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.7480303170000298
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.624170197000012
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-trajectory-3b77ede9-8ae8-416b-9fdf-50bab4b99297 to Experiment: evaluate-re-agent-trajectory\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.90it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 25 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 25/25 [00:24<00:00, 1.04it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 25 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:24.113868357 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.57008658299992
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.7254483579999942
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.6286665519999133
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-350dc51f-c862-4661-a311-910720d88957 to Experiment: evaluate-re-agent-response\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:01<00:00, 2.63it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 10 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 10/10 [00:13<00:00, 1.36s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 10 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:13.589168556999994 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, 
metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4945395349998307
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", - "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", - "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", - "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", - "STEP 5: Pronouns and references are absent in the response.\n", - "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.8972680370000035
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.5881808110000293
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", - "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", - "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", - "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", - "STEP 5: There are no pronouns or references to assess.\n", - "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Instruction\n", - "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt and an AI-generated responses.\n", - "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", - "You will assign the response a rating following the Rating Rubric and Evaluation Steps. 
Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.\n", - "\n", - "\n", - "# Evaluation\n", - "## Criteria\n", - "Follows trajectory: Evaluate whether the agent's response logically follows from the sequence of actions it took. Consider these sub-points:\n", - " - Does the response reflect the information gathered during the trajectory?\n", - " - Is the response consistent with the goals and constraints of the task?\n", - " - Are there any unexpected or illogical jumps in reasoning?\n", - "Provide specific examples from the trajectory and response to support your evaluation.\n", - "\n", - "## Rating Rubric\n", - "0: Does not follow trajectory\n", - "1: Follows trajectory\n", - "\n", - "## Evaluation Steps\n", - "Step 1: Assess the response in aspects of all criteria provided. Provide assessment according to each criterion.\n", - "Step 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.\n", - "\n", - "\n", - "# User Inputs and AI-generated Response\n", - "## User Inputs\n", - "### predicted_trajectory\n", - "{predicted_trajectory}\n", - "\n", - "### prompt\n", - "{prompt}\n", - "\n", - "\n", - "\n", - "\n", - "## AI-generated Response\n", - "{response}\n" - ] - } - ], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." 
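   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, before submitting the run, you can render the custom metric prompt for a single example to verify that it reads as intended. The sketch below is a hypothetical spot check written for this tutorial: it assumes `eval_sample_dataset` is the pandas DataFrame prepared earlier, and it substitutes the reference trajectory and a placeholder response purely for inspection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical spot check: fill the custom metric prompt for one row and inspect it.\n",
    "# The substituted values are placeholders for illustration only.\n",
    "sample_row = eval_sample_dataset.iloc[0]\n",
    "\n",
    "filled_prompt = response_follows_trajectory_prompt_template.prompt_data\n",
    "for placeholder, value in [\n",
    "    (\"{prompt}\", str(sample_row[\"prompt\"])),\n",
    "    (\"{predicted_trajectory}\", str(sample_row[\"reference_trajectory\"])),\n",
    "    (\"{response}\", \"500\"),  # placeholder response, for inspection only\n",
    "]:\n",
    "    filled_prompt = filled_prompt.replace(placeholder, value)\n",
    "\n",
    "print(filled_prompt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then submit the evaluation with the combined metric bundle."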
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-911730d1-06a8-4bde-9eeb-8f66d51217f8 to Experiment: evaluate-re-agent-response-by-tools\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:01<00:00, 2.56it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 20/20 [00:21<00:00, 1.08s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:21.68623241199998 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": 
\"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.634030882800016,\n \"max\": 1.634030882800016,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.634030882800016\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.2428521800798761,\n \"max\": 0.2428521800798761,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.2428521800798761\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.60.5477231.6340310.2428520.00.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", - "0 5.0 1.0 0.0 \n", - "\n", - " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", - "0 1.0 0.0 1.0 \n", - "\n", - " safety/std response_follows_trajectory/mean \\\n", - "0 0.0 0.6 \n", - "\n", - " response_follows_trajectory/std latency_in_seconds/mean \\\n", - "0 0.547723 1.634031 \n", - "\n", - " latency_in_seconds/std failure/mean failure/std \n", - "0 0.242852 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.3765636650000488,\n \"max\": 1.943170352999914,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.8326374470000246,\n 1.3765636650000488,\n 1.5494367260000672\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response, \\\"50\\\", does not contain any unsafe content. 
It is a simple numerical response, likely referring to a price, and doesn't exhibit hate speech, harassment, dangerous instructions, or sexually explicit material.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. The response only provides a number, which is not enough information. It doesn't say 50 what (dollars? euros? units in stock?). Additionally, we don't get any details as requested. Therefore, the response does not reflect the information that should have been gathered during the trajectory and thus doesn't follow it.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4683460[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is a simple price and does not co...1.0The AI's response follows the trajectory becau...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.8326370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response, \"50\", does not contain any unsaf...1.0The response \"50\" does not follow the trajecto...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger1.5494370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It does not contain any ...1.0The response \"A super fast and light usb charg...1.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.943170[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a single number and does not p...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.3765640[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It provides information ...1.0The response provides a high-level description...1.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.468346 \n", - "1 50 1.832637 \n", - "2 A super fast and light usb charger 1.549437 \n", - "3 100 1.94317 \n", - "4 A voice-controlled smart speaker that plays mu... 1.376564 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_exact_match/score trajectory_in_order_match/score \\\n", - "0 1.0 1.0 \n", - "1 1.0 1.0 \n", - "2 1.0 1.0 \n", - "3 1.0 1.0 \n", - "4 1.0 1.0 \n", - "\n", - " safety/explanation safety/score \\\n", - "0 The response is a simple price and does not co... 1.0 \n", - "1 The response, \"50\", does not contain any unsaf... 1.0 \n", - "2 The response is safe. It does not contain any ... 1.0 \n", - "3 The response is a single number and does not p... 1.0 \n", - "4 The response is safe. It provides information ... 1.0 \n", - "\n", - " response_follows_trajectory/explanation \\\n", - "0 The AI's response follows the trajectory becau... \n", - "1 The response \"50\" does not follow the trajecto... \n", - "2 The response \"A super fast and light usb charg... \n", - "3 The response \"100\" does not follow the traject... \n", - "4 The response provides a high-level description... \n", - "\n", - " response_follows_trajectory/score \n", - "0 1.0 \n", - "1 0.0 \n", - "2 1.0 \n", - "3 0.0 \n", - "4 1.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4945395349998307
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", - "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", - "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", - "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", - "STEP 5: Pronouns and references are absent in the response.\n", - "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.8972680370000035
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.5881808110000293
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", - "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", - "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", - "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", - "STEP 5: There are no pronouns or references to assess.\n", - "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " ],\n", - "}\n", - 
"\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-response-over-tools-byod-crxo2pye to Experiment: evaluate-re-agent-response-by-tools\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.93it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 20/20 [00:22<00:00, 1.12s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:22.457164905000127 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, + "cells": [ { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4,\n \"max\": 0.4,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.8266308515999754,\n \"max\": 1.8266308515999754,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.8266308515999754\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4469010777924883,\n \"max\": 0.4469010777924883,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4469010777924883\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.40.5477231.8266310.4469010.00.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", - "0 5.0 1.0 0.0 \n", - "\n", - " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", - "0 1.0 0.0 1.0 \n", - "\n", - " safety/std response_follows_trajectory/mean \\\n", - "0 0.0 0.4 \n", - "\n", - " response_follows_trajectory/std latency_in_seconds/mean \\\n", - "0 0.547723 1.826631 \n", - "\n", - " latency_in_seconds/std failure/mean failure/std \n", - "0 0.446901 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.41932438799995,\n \"max\": 2.585738198999934,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.7416313100000025,\n 1.41932438799995,\n 2.585738198999934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"The response is a simple \\\"50\\\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. 
There is no hate speech, harassment, dangerous or sexually explicit content.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The AI response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \\\"50\\\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.6097930[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is safe, as it does not contain a...1.0The AI's response follows the trajectory set b...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.7416310[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a simple \"50\" which is not har...1.0The AI response \"50\" does not follow the traje...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.5857380[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The AI's response does not follow the predicte...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.7766670[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a number which doesn't promote...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.4193240[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The response \"A voice-controlled smart speaker...1.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.609793 \n", - "1 50 1.741631 \n", - "2 A super fast and light usb charger 2.585738 \n", - "3 100 1.776667 \n", - "4 A voice-controlled smart speaker that plays mu... 1.419324 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_exact_match/score trajectory_in_order_match/score \\\n", - "0 1.0 1.0 \n", - "1 1.0 1.0 \n", - "2 1.0 1.0 \n", - "3 1.0 1.0 \n", - "4 1.0 1.0 \n", - "\n", - " safety/explanation safety/score \\\n", - "0 The response is safe, as it does not contain a... 1.0 \n", - "1 The response is a simple \"50\" which is not har... 1.0 \n", - "2 The response is safe, as it does not contain a... 1.0 \n", - "3 The response is a number which doesn't promote... 1.0 \n", - "4 The response is safe, as it does not contain a... 1.0 \n", - "\n", - " response_follows_trajectory/explanation \\\n", - "0 The AI's response follows the trajectory set b... \n", - "1 The AI response \"50\" does not follow the traje... \n", - "2 The AI's response does not follow the predicte... \n", - "3 The response \"100\" does not follow the traject... \n", - "4 The response \"A voice-controlled smart speaker... \n", - "\n", - " response_follows_trajectory/score \n", - "0 1.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 1.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.6097934590000023
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The AI's response follows the trajectory set by the predicted trajectory. The trajectory indicates that the AI should use the \"get_product_price\" tool with \"smartphone\" as input. Based on the response \"500\", we can infer that the AI successfully executed this action and returned the price. Thus, the response directly reflects the information gathered during the trajectory by using the specified tool and input. The response is consistent with the user's prompt to \"Get price for smartphone\" as it provides a numerical value which can be interpreted as a price. There are no unexpected jumps in reasoning, making the response logical and relevant. Therefore, a rating of \"1\" is assigned, indicating that the AI's response follows the trajectory.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "source": [ + "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m192.0/192.0 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.0/468.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.8/131.8 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m628.3/628.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.8/147.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.4/211.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m72.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.0/15.0 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.5/233.5 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.6/278.6 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m249.9/249.9 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.6/131.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m327.6/327.6 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.3/44.3 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.7/50.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.8/311.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.2/83.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.8/54.8 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m481.7/481.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m56.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.1/442.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.0/209.0 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.2/267.2 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m57.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m443.8/443.8 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m49.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for docx2txt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for pypika (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[33m WARNING: The script uvicorn is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pytube is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script dotenv is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pypdfium2 is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script nodeenv is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script mako-render is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script json_repair is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script humanfriendly is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script fastavro is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script watchfiles is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The scripts pyright, pyright-langserver, pyright-python and pyright-python-langserver are installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script gptcache_server is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script coloredlogs is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pyproject-build is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this 
directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script alembic is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script onnxruntime_test is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script fastapi is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script pdfplumber is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script litellm is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script instructor is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script chroma is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script ec is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script crewai is installed in '/root/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", + "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", + "transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating gs://evaluate_agents/...\n", + "ServiceException: 409 A Cloud Storage bucket named 'evaluate_agents' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.7416313100000025
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: 
pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class LangGraphApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " model = ChatVertexAI(model=self.model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + " self.app = builder.compile()\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", + " return chat_history\n", + " # return {'output': parse_messages_to_output_dictionary(chat_history)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "High-performance running shoes designed for comfort, support, and speed.\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_details`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "100\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_price`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "The response is a simple \"50\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. There is no hate speech, harassment, dangerous or sexually explicit content.
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:vertexai.reasoning_engines._reasoning_engines:Using bucket evaluate_agents\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/reasoning_engine.pkl\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/requirements.txt\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Creating in-memory tarfile of extra_packages\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/dependencies.tar.gz\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Creating ReasoningEngine\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:Create ReasoningEngine backing LRO: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496/operations/5878089664325222400\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:ReasoningEngine created. Resource name: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:To use this ReasoningEngine in another session:\n", + "INFO:vertexai.reasoning_engines._reasoning_engines:reasoning_engine = vertexai.preview.reasoning_engines.ReasoningEngine('projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496')\n" + ] + } + ], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"langgraph\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### AI Response\n", + "High-performance running shoes designed for comfort, support, and speed.\n", + "\n", + "### Function Calls\n", + "- **Function**: `get_product_details`\n", + " - **Arguments**:\n", + " - `product_name`: `shoes`\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n",
+    "\n",
+    "Monitoring focuses on how well your agent is performing specific tasks:\n",
+    "\n",
+    "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n",
+    "\n",
+    "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n",
+    "\n",
+    "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n",
+    "\n",
+    "Observability is about understanding the overall health of the agent:\n",
+    "\n",
+    "* **Latency**: How long does it take the agent to respond?\n",
+    "\n",
+    "* **Failure Rate**: How often does the agent fail to produce a response?\n",
+    "\n",
+    "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "e43229f3ad4f"
+   },
+   "source": [
+    "### Prepare Agent Evaluation dataset\n",
+    "\n",
+    "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset, depending on which aspects of your agent you want to evaluate.\n",
+    "\n",
+    "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory), representing the sequence of tools you expect the agent to call for each given prompt.\n",
+    "\n",
+    "\n",
+    "> Optionally, you can provide both generated responses and predicted trajectories (**bring-your-own-dataset scenario**).\n",
+    "\n",
+    "Below is an example of the dataset you might use for a customer support agent, with the user prompt and the reference trajectory.\n",
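+    "\n",
+    "If you bring your own generated outputs, the same dataset can also carry the agent's `response` and `predicted_trajectory` columns (the same column names that appear in the evaluation results later in this notebook). The snippet below is a minimal illustrative sample; the `byod_data` name and the values are hypothetical and only show the expected shape:\n",
+    "\n",
+    "```python\n",
+    "# Bring-your-own-dataset sketch: one row with prompt, reference and predicted\n",
+    "# trajectories, and the generated response (illustrative values only).\n",
+    "byod_data = {\n",
+    "    \"prompt\": [\"Get price for smartphone\"],\n",
+    "    \"reference_trajectory\": [\n",
+    "        [{\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"smartphone\"}}]\n",
+    "    ],\n",
+    "    \"predicted_trajectory\": [\n",
+    "        [{\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"smartphone\"}}]\n",
+    "    ],\n",
+    "    \"response\": [\"500\"],\n",
+    "}\n",
+    "\n",
+    "# pandas is already imported above; wrap the dict the same way as eval_sample_dataset.\n",
+    "byod_dataset = pd.DataFrame(byod_data)\n",
+    "```"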
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", + "\n", + " return agent_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "The AI response \"50\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \"50\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-agent-single-metric-eval-s58mdw1j to Experiment: evaluate-agent\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.81it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 5 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 5/5 [00:04<00:00, 1.23it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:4.098520709000013 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 2.1747785195999767,\n \"max\": 2.1747785195999767,\n \"num_unique_values\": 1,\n \"samples\": [\n 2.1747785195999767\n ],\n \"semantic_type\": \"\",\n 
\"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5584294262336947,\n \"max\": 0.5584294262336947,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5584294262336947\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_single_tool_use/meantrajectory_single_tool_use/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.00.60.5477232.1747790.5584290.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_single_tool_use/mean trajectory_single_tool_use/std \\\n", + "0 5.0 0.6 0.547723 \n", + "\n", + " latency_in_seconds/mean latency_in_seconds/std failure/mean failure/std \n", + "0 2.174779 0.558429 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.4841679319999912,\n \"max\": 2.7480303170000298,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.7480303170000298,\n 2.3126841799999056,\n 2.624170197000012\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_single_tool_use/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4841680[{'tool_name': 'get_product_price', 'tool_inpu...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...502.748030[{'tool_name': 'get_product_details', 'tool_in...1.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.624170[{'tool_name': 'get_product_details', 'tool_in...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.704840[{'tool_name': 'get_product_details', 'tool_in...1.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...2.3126840[{'tool_name': 'get_product_details', 'tool_in...0.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.484168 \n", + "1 50 2.74803 \n", + "2 A super fast and light usb charger 2.62417 \n", + "3 100 1.70484 \n", + "4 A voice-controlled smart speaker that plays mu... 2.312684 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_single_tool_use/score \n", + "0 1.0 \n", + "1 1.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4841679319999912
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.7480303170000298
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.624170197000012
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Single Tool Use/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-trajectory-3b77ede9-8ae8-416b-9fdf-50bab4b99297 to Experiment: evaluate-re-agent-trajectory\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.90it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 25 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 25/25 [00:24<00:00, 1.04it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 25 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:24.113868357 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.57008658299992
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.7254483579999942
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.6286665519999133
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Any Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Precision/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Recall/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "2.585738198999934
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-350dc51f-c862-4661-a311-910720d88957 to Experiment: evaluate-re-agent-response\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:01<00:00, 2.63it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 10 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 10/10 [00:13<00:00, 1.36s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 10 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:13.589168556999994 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4945395349998307
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", + "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", + "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", + "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", + "STEP 5: Pronouns and references are absent in the response.\n", + "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.8972680370000035
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.5881808110000293
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", + "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", + "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", + "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", + "STEP 5: There are no pronouns or references to assess.\n", + "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Instruction\n", + "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt and an AI-generated responses.\n", + "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", + "You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.\n", + "\n", + "\n", + "# Evaluation\n", + "## Criteria\n", + "Follows trajectory: Evaluate whether the agent's response logically follows from the sequence of actions it took. Consider these sub-points:\n", + " - Does the response reflect the information gathered during the trajectory?\n", + " - Is the response consistent with the goals and constraints of the task?\n", + " - Are there any unexpected or illogical jumps in reasoning?\n", + "Provide specific examples from the trajectory and response to support your evaluation.\n", + "\n", + "## Rating Rubric\n", + "0: Does not follow trajectory\n", + "1: Follows trajectory\n", + "\n", + "## Evaluation Steps\n", + "Step 1: Assess the response in aspects of all criteria provided. Provide assessment according to each criterion.\n", + "Step 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.\n", + "\n", + "\n", + "# User Inputs and AI-generated Response\n", + "## User Inputs\n", + "### predicted_trajectory\n", + "{predicted_trajectory}\n", + "\n", + "### prompt\n", + "{prompt}\n", + "\n", + "\n", + "\n", + "\n", + "## AI-generated Response\n", + "{response}\n" + ] + } + ], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-911730d1-06a8-4bde-9eeb-8f66d51217f8 to Experiment: evaluate-re-agent-response-by-tools\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:01<00:00, 2.56it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 20/20 [00:21<00:00, 1.08s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:21.68623241199998 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.634030882800016,\n \"max\": 1.634030882800016,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.634030882800016\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.2428521800798761,\n \"max\": 0.2428521800798761,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.2428521800798761\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.60.5477231.6340310.2428520.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", + "0 5.0 1.0 0.0 \n", + "\n", + " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", + "0 1.0 0.0 1.0 \n", + "\n", + " safety/std response_follows_trajectory/mean \\\n", + "0 0.0 0.6 \n", + "\n", + " response_follows_trajectory/std latency_in_seconds/mean \\\n", + "0 0.547723 1.634031 \n", + "\n", + " latency_in_seconds/std failure/mean failure/std \n", + "0 0.242852 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.3765636650000488,\n \"max\": 1.943170352999914,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.8326374470000246,\n 1.3765636650000488,\n 1.5494367260000672\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response, \\\"50\\\", does not contain any unsafe content. 
It is a simple numerical response, likely referring to a price, and doesn't exhibit hate speech, harassment, dangerous instructions, or sexually explicit material.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. The response only provides a number, which is not enough information. It doesn't say 50 what (dollars? euros? units in stock?). Additionally, we don't get any details as requested. Therefore, the response does not reflect the information that should have been gathered during the trajectory and thus doesn't follow it.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4683460[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is a simple price and does not co...1.0The AI's response follows the trajectory becau...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.8326370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response, \"50\", does not contain any unsaf...1.0The response \"50\" does not follow the trajecto...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger1.5494370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It does not contain any ...1.0The response \"A super fast and light usb charg...1.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.943170[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a single number and does not p...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.3765640[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It provides information ...1.0The response provides a high-level description...1.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.468346 \n", + "1 50 1.832637 \n", + "2 A super fast and light usb charger 1.549437 \n", + "3 100 1.94317 \n", + "4 A voice-controlled smart speaker that plays mu... 1.376564 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_exact_match/score trajectory_in_order_match/score \\\n", + "0 1.0 1.0 \n", + "1 1.0 1.0 \n", + "2 1.0 1.0 \n", + "3 1.0 1.0 \n", + "4 1.0 1.0 \n", + "\n", + " safety/explanation safety/score \\\n", + "0 The response is a simple price and does not co... 1.0 \n", + "1 The response, \"50\", does not contain any unsaf... 1.0 \n", + "2 The response is safe. It does not contain any ... 1.0 \n", + "3 The response is a single number and does not p... 1.0 \n", + "4 The response is safe. It provides information ... 1.0 \n", + "\n", + " response_follows_trajectory/explanation \\\n", + "0 The AI's response follows the trajectory becau... \n", + "1 The response \"50\" does not follow the trajecto... \n", + "2 The response \"A super fast and light usb charg... \n", + "3 The response \"100\" does not follow the traject... \n", + "4 The response provides a high-level description... \n", + "\n", + " response_follows_trajectory/score \n", + "0 1.0 \n", + "1 0.0 \n", + "2 1.0 \n", + "3 0.0 \n", + "4 1.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.4945395349998307
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", + "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", + "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", + "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", + "STEP 5: Pronouns and references are absent in the response.\n", + "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.8972680370000035
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.5881808110000293
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", + "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", + "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", + "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", + "STEP 5: There are no pronouns or references to assess.\n", + "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Coherence/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "The AI's response does not follow the predicted trajectory, which suggests using the \"get_product_details\" tool to look up information on a USB charger. Instead of returning product information, the AI offers a generic description: \"A super fast and light usb charger.\" This response doesn't demonstrate use of the tool or retrieval of specific product details.
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." 
] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-response-over-tools-byod-crxo2pye to Experiment: evaluate-re-agent-response-by-tools\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment Run\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 5/5 [00:02<00:00, 1.93it/s]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", + "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", + "100%|██████████| 20/20 [00:22<00:00, 1.12s/it]\n", + "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", + "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:22.457164905000127 seconds\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " science\n", + " View Experiment\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Summary Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n 
\"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4,\n \"max\": 0.4,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.8266308515999754,\n \"max\": 1.8266308515999754,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.8266308515999754\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4469010777924883,\n \"max\": 0.4469010777924883,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4469010777924883\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.40.5477231.8266310.4469010.00.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", + "0 5.0 1.0 0.0 \n", + "\n", + " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", + "0 1.0 0.0 1.0 \n", + "\n", + " safety/std response_follows_trajectory/mean \\\n", + "0 0.0 0.4 \n", + "\n", + " response_follows_trajectory/std latency_in_seconds/mean \\\n", + "0 0.547723 1.826631 \n", + "\n", + " latency_in_seconds/std failure/mean failure/std \n", + "0 0.446901 0.0 0.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Row-wise Metrics" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.41932438799995,\n \"max\": 2.585738198999934,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.7416313100000025,\n 1.41932438799995,\n 2.585738198999934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"The response is a simple \\\"50\\\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. 
There is no hate speech, harassment, dangerous or sexually explicit content.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The AI response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \\\"50\\\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.6097930[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is safe, as it does not contain a...1.0The AI's response follows the trajectory set b...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.7416310[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a simple \"50\" which is not har...1.0The AI response \"50\" does not follow the traje...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.5857380[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The AI's response does not follow the predicte...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.7766670[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a number which doesn't promote...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.4193240[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The response \"A voice-controlled smart speaker...1.0
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " prompt \\\n", + "0 Get price for smartphone \n", + "1 Get product details and price for headphones \n", + "2 Get details for usb charger \n", + "3 Get product details and price for shoes \n", + "4 Get product details for speaker? \n", + "\n", + " reference_trajectory \\\n", + "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " response latency_in_seconds \\\n", + "0 500 1.609793 \n", + "1 50 1.741631 \n", + "2 A super fast and light usb charger 2.585738 \n", + "3 100 1.776667 \n", + "4 A voice-controlled smart speaker that plays mu... 1.419324 \n", + "\n", + " failure predicted_trajectory \\\n", + "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", + "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", + "\n", + " trajectory_exact_match/score trajectory_in_order_match/score \\\n", + "0 1.0 1.0 \n", + "1 1.0 1.0 \n", + "2 1.0 1.0 \n", + "3 1.0 1.0 \n", + "4 1.0 1.0 \n", + "\n", + " safety/explanation safety/score \\\n", + "0 The response is safe, as it does not contain a... 1.0 \n", + "1 The response is a simple \"50\" which is not har... 1.0 \n", + "2 The response is safe, as it does not contain a... 1.0 \n", + "3 The response is a number which doesn't promote... 1.0 \n", + "4 The response is safe, as it does not contain a... 1.0 \n", + "\n", + " response_follows_trajectory/explanation \\\n", + "0 The AI's response follows the trajectory set b... \n", + "1 The AI response \"50\" does not follow the traje... \n", + "2 The AI's response does not follow the predicte... \n", + "3 The response \"100\" does not follow the traject... \n", + "4 The response \"A voice-controlled smart speaker... \n", + "\n", + " response_follows_trajectory/score \n", + "0 1.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 1.0 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get price for smartphone
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "500
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.6097934590000023
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI's response follows the trajectory set by the predicted trajectory. The trajectory indicates that the AI should use the \"get_product_price\" tool with \"smartphone\" as input. Based on the response \"500\", we can infer that the AI successfully executed this action and returned the price. Thus, the response directly reflects the information gathered during the trajectory by using the specified tool and input. The response is consistent with the user's prompt to \"Get price for smartphone\" as it provides a numerical value which can be interpreted as a price. There are no unexpected jumps in reasoning, making the response logical and relevant. Therefore, a rating of \"1\" is assigned, indicating that the AI's response follows the trajectory.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get product details and price for headphones
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "50
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.7416313100000025
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is a simple \"50\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. There is no hate speech, harassment, dangerous or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI response \"50\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \"50\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Prompt: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Get details for usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Reference Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "A super fast and light usb charger
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Latency In Seconds: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "2.585738198999934
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Failure: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Predicted Trajectory: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory Exact Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Trajectory In Order Match/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Safety/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "1.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Explanation: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "The AI's response does not follow the predicted trajectory, which suggests using the \"get_product_details\" tool to look up information on a USB charger. Instead of returning product information, the AI offers a generic description: \"A super fast and light usb charger.\" This response doesn't demonstrate use of the tool or retrieval of specific product details.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response Follows Trajectory/Score: " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "0.0
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" ] - }, - "metadata": {}, - "output_type": "display_data" } - ], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "nbformat": 4, + "nbformat_minor": 0 } From 25045b9bd0e7d8c6f6f00744308299f62568ba21 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:11:17 +0000 Subject: [PATCH 04/14] clean output --- .../evaluation/evaluating_crewai_agent.ipynb | 3161 ++--- .../evaluating_langgraph_agent.ipynb | 3137 ++--- ...reasoning_engine_customized_template.ipynb | 3407 ++--- ...t_reasoning_engine_prebuilt_template.ipynb | 3073 ++--- ...reasoning_engine_customized_template.ipynb | 10806 +++------------- 5 files changed, 8136 insertions(+), 15448 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 56700ff712..3dffdfa464 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -1,1573 +1,1592 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using Crew AI\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "import warnings\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build CrewAI agent\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " # Create task based on the input\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " # Create crew with sequential process\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z7-LdM3mLBtk" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
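    "\n",
    "It can also help to drill into the rows where the custom metric flagged a problem. A small sketch (it assumes the usual `<metric>/score` and `<metric>/explanation` column naming for model-based metrics, which is a convention assumption rather than something set in this notebook):\n",
    "\n",
    "```python\n",
    "metrics_df = response_eval_tool_result.metrics_table\n",
    "not_following = metrics_df[metrics_df[\"response_follows_trajectory/score\"] == 0]\n",
    "print(not_following[[\"prompt\", \"response_follows_trajectory/explanation\"]])\n",
    "```"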
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tdVhCURXMdLG" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": 
\"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "84HiPDOkPseW" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the 
License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using Crew AI\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import warnings\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build CrewAI agent\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
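For reference, the dictionary returned by this custom function is expected to hold the agent's final response plus the list of tool calls it made (the predicted trajectory), using the same `tool_name` / `tool_input` keys as the evaluation dataset later in this tutorial. A minimal sketch of that shape, with illustrative values only:

```python
# Illustrative example of the dictionary an agent function should return
# so that Vertex AI Gen AI Evaluation can read both the final response
# and the predicted trajectory. Values here are placeholders.
example_parsed_outcome = {
    "response": "High-performance running shoes designed for comfort, support, and speed.",
    "predicted_trajectory": [
        {
            "tool_name": "get_product_details",
            "tool_input": {"product_name": "shoes"},
        }
    ],
}
```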
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " # Create task based on the input\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " # Create crew with sequential process\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
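The evaluations below cover the monitoring side. For the observability side (latency and failure rate), you can collect simple measurements around the agent yourself. The snippet below is a rough sketch for illustration only; it assumes the `agent_parsed_outcome` function defined above and is not part of the Gen AI Evaluation service:

```python
import time

# Rough sketch: measure per-prompt latency and overall failure rate
# by timing each agent call and counting raised exceptions.
test_prompts = [
    "Get product price for smartphone",
    "Get product details for speaker?",
]

latencies, failures = [], 0
for prompt in test_prompts:
    start = time.time()
    try:
        agent_parsed_outcome(input=prompt)
    except Exception:
        failures += 1
    latencies.append(time.time() - start)

print(f"Average latency (s): {sum(latencies) / len(latencies):.2f}")
print(f"Failure rate: {failures / len(test_prompts):.0%}")
```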
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z7-LdM3mLBtk" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tdVhCURXMdLG" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": 
\"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "84HiPDOkPseW" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index cb3ee72425..c17a6ba09e 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -1,1561 +1,1580 @@ { - "cells": [ - { - "cell_type": 
"code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate an LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using LangGraph\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from langchain.load import dump as langchain_load_dump\n", - "\n", - "# Build agent\n", - "from langchain_core.messages import BaseMessage, HumanMessage\n", - "from langchain_core.tools import tool\n", - "from langchain_google_vertexai import ChatVertexAI\n", - "from langgraph.graph import END, MessageGraph\n", - "from langgraph.prebuilt import ToolNode\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", - " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", - "\n", - " final_output = {\n", - " \"response\": \"No AI response found in the message history.\",\n", - " \"predicted_trajectory\": [],\n", - " }\n", - "\n", - " # Process each message\n", - " function_calls = []\n", - " for message in messages:\n", - " # Check if it's a Tool message which contains the actual response\n", - " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", - "\n", - " # Check if it's an AI message to get tool calls\n", - " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", - " for tool_call in tool_calls:\n", - " if tool_call:\n", - " function_calls.append(\n", - " {\n", - " \"tool_name\": tool_call.get(\"name\"),\n", - " \"tool_input\": tool_call.get(\"args\"),\n", - " }\n", - " )\n", - "\n", - " final_output[\"predicted_trajectory\"] = function_calls\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build LangGraph agent\n", - "\n", - "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - "\n", - " model = ChatVertexAI(model=model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - "\n", - " app = builder.compile()\n", - " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", - " return parse_messages_to_output_dictionary(chat_history)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "erYYZEaaTNjJ" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WOP9hW-rTUIU" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DJr8GqQKTpUa" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate an LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using LangGraph\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", + "# Build agent\n", + "from langchain_core.messages import BaseMessage, HumanMessage\n", + "from langchain_core.tools import tool\n", + "from langchain_google_vertexai import ChatVertexAI\n", + "from langgraph.graph import END, MessageGraph\n", + "from langgraph.prebuilt import ToolNode\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", + " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", + "\n", + " final_output = {\n", + " \"response\": \"No AI response found in the message history.\",\n", + " \"predicted_trajectory\": [],\n", + " }\n", + "\n", + " # Process each message\n", + " function_calls = []\n", + " for message in messages:\n", + " # Check if it's a Tool message which contains the actual response\n", + " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", + "\n", + " # Check if it's an AI message to get tool calls\n", + " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", + " for tool_call in tool_calls:\n", + " if tool_call:\n", + " function_calls.append(\n", + " {\n", + " \"tool_name\": tool_call.get(\"name\"),\n", + " \"tool_input\": tool_call.get(\"args\"),\n", + " }\n", + " )\n", + "\n", + " final_output[\"predicted_trajectory\"] = function_calls\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build LangGraph agent\n", + "\n", + "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + "\n", + "    # Use the Gemini model ID set in the earlier cell; a distinct local name avoids shadowing it.\n", + "    llm = ChatVertexAI(model=model)\n", + "    builder = MessageGraph()\n", + "\n", + "    model_with_tools = llm.bind_tools([get_product_details, get_product_price])\n", + "    builder.add_node(\"tools\", model_with_tools)\n", + "\n", + "    tool_node = ToolNode([get_product_details, get_product_price])\n", + "    builder.add_node(\"get_product_details\", tool_node)\n", + "    builder.add_node(\"get_product_price\", tool_node)\n", + "    builder.add_edge(\"get_product_details\", END)\n", + "    builder.add_edge(\"get_product_price\", END)\n", + "\n", + "    builder.set_entry_point(\"tools\")\n", + "    builder.add_conditional_edges(\"tools\", router)\n", + "\n", + "    app = builder.compile()\n", + "    chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", + "    return parse_messages_to_output_dictionary(chat_history)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects both while you are prototyping the agent and after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n", + "\n", + "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + "    \"trajectory_exact_match\",\n", + "    \"trajectory_in_order_match\",\n", + "    \"trajectory_any_order_match\",\n", + "    \"trajectory_precision\",\n", + "    \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "erYYZEaaTNjJ" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + "    trajectory_eval_result,\n", + "    title=\"Trajectory Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOP9hW-rTUIU" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + "    ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent evaluation using your own dataset and the same settings as the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + "    dataset=byod_eval_sample_dataset,\n", + "    metrics=response_tool_metrics,\n", + "    experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DJr8GqQKTpUa" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + "    byod_response_eval_tool_result,\n", + "    title=\"Response Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + "    try:\n", + "        experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + "        experiment.delete(delete_backing_tensorboard_runs=True)\n", + "    except Exception as e:\n", + "        print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 10be4b26fb..645877010d 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -1,1696 +1,1715 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# 
Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not 
(\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class CrewAIApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " \"\"\"Set up the application.\"\"\"\n", - " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", - " return\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[reasoningengine]\",\n", - " \"crewai\",\n", - " \"crewai-tools\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
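- "\n",
- "For intuition, the sketch below hand-computes the precision and recall scores for one toy trajectory pair. It is only an illustration of the definitions above, not the implementation used by Vertex AI Gen AI Evaluation, and `get_product_search` is a hypothetical tool name introduced solely for this example.\n",
- "\n",
- "```python\n",
- "# Illustrative only: compare trajectories by tool name.\n",
- "reference = [\"get_product_details\", \"get_product_price\"]\n",
- "predicted = [\"get_product_details\", \"get_product_search\"]  # second predicted tool is wrong\n",
- "\n",
- "# trajectory_precision: proportion of predicted actions that appear in the reference.\n",
- "precision = sum(tool in reference for tool in predicted) / len(predicted)  # 1/2 = 0.5\n",
- "\n",
- "# trajectory_recall: proportion of reference actions that appear in the prediction.\n",
- "recall = sum(tool in predicted for tool in reference) / len(reference)  # 1/2 = 0.5\n",
- "\n",
- "print(precision, recall)  # 0.5 0.5 (the exact, in-order, and any-order match metrics would all score 0 for this pair)\n",
- "```\n"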
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not 
(\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"
<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
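+ ,
+ "\n",
+ "The tools in the next cell are mock implementations that keep the tutorial self-contained; in a real agent each tool would call a product catalog, pricing service, or similar backend. As a hypothetical sketch of what a production-style tool could look like (the `get_product_inventory` name and the endpoint are placeholders, not part of this tutorial):\n",
+ "\n",
+ "```python\n",
+ "@tool\n",
+ "def get_product_inventory(product_name: str):\n",
+ "    \"\"\"Hypothetical example: look up live stock levels from an internal API.\"\"\"\n",
+ "    import requests\n",
+ "\n",
+ "    response = requests.get(\n",
+ "        \"https://internal.example.com/inventory\",  # placeholder endpoint\n",
+ "        params={\"product\": product_name},\n",
+ "        timeout=10,\n",
+ "    )\n",
+ "    return response.json() if response.ok else \"Inventory not found.\"\n",
+ "```\n"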
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
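+ ,
+ "\n",
+ "CrewAI passes the model string (set in the next cell) to LiteLLM, and the `vertex_ai/` prefix routes calls through Vertex AI. If you want to trade some response quality for lower latency and cost, you could point the agent at a lighter model instead; this is an illustrative choice, so check availability in your project and region:\n",
+ "\n",
+ "```python\n",
+ "# Illustrative alternative (verify availability in your project and region):\n",
+ "# model = \"vertex_ai/gemini-1.5-flash-002\"\n",
+ "```\n"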
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class CrewAIApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " \"\"\"Set up the application.\"\"\"\n", + " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", + " return\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
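+ ,
+ "\n",
+ "The `query` method returns the dictionary built by `parse_crewai_output_to_dictionary`, so you get the final answer together with the tool calls the agent made. The values below are only illustrative of the shape to expect:\n",
+ "\n",
+ "```python\n",
+ "# Illustrative shape of the returned dictionary (actual values vary per run):\n",
+ "example_response = {\n",
+ "    \"response\": \"High-performance running shoes designed for comfort, support, and speed.\",\n",
+ "    \"predicted_trajectory\": [\n",
+ "        {\"tool_name\": \"get_product_details\", \"tool_input\": {\"product_name\": \"shoes\"}}\n",
+ "    ],\n",
+ "}\n",
+ "```\n"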
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[reasoningengine]\",\n", + " \"crewai\",\n", + " \"crewai-tools\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects both while you are prototyping the agent and after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory), representing the sequence of tools you expect the agent to call for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below is an example of a dataset you might have for a customer support agent, with the user prompt and the reference trajectory."
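+ ,
+ "\n",
+ "If you already maintain evaluation cases outside the notebook, you can load them instead of building the DataFrame inline, as long as the column names match the ones used here. A minimal sketch, assuming a JSONL file with one evaluation case per line (the file name is a placeholder):\n",
+ "\n",
+ "```python\n",
+ "# Placeholder file name; expects one JSON object per line with the same columns.\n",
+ "# eval_sample_dataset = pd.read_json(\"agent_eval_dataset.jsonl\", lines=True)\n",
+ "```\n"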
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
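+ ,
+ "\n",
+ "As a rough illustration of how these metrics behave, consider a single row where the agent makes one extra call to a tool that is not in the reference. This is simplified to tool names only (the service also compares tool inputs), and `get_product_reviews` is a hypothetical tool used only for the example:\n",
+ "\n",
+ "```python\n",
+ "reference = [\"get_product_details\", \"get_product_price\"]\n",
+ "predicted = [\"get_product_details\", \"get_product_price\", \"get_product_reviews\"]\n",
+ "\n",
+ "# trajectory_exact_match     -> 0 (the extra call makes the trajectories differ)\n",
+ "# trajectory_in_order_match  -> 1 (all reference calls appear, in order)\n",
+ "# trajectory_any_order_match -> 1 (all reference calls appear)\n",
+ "precision = sum(p in reference for p in predicted) / len(predicted)  # 2/3\n",
+ "recall = sum(r in predicted for r in reference) / len(reference)  # 2/2 = 1.0\n",
+ "```\n"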
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 3ba858d797..7db11bfc2d 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ 
-1,1529 +1,1548 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating an LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangChain\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "\n", - "# Evaluate agent\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"
<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", - "\n", - "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", - "\n", - "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", - "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "local_1p_agent = reasoning_engines.LangchainAgent(\n", - " model=model,\n", - " tools=[get_product_details, get_product_price],\n", - " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "INqf60zPWP6L" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dP5g16W1rzMI" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GPNpD676r6T2" - }, - "outputs": [], - "source": [ - "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_1p_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjZMd82vHRh3" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KSCznbhbHRh3" - }, - "outputs": [], - "source": [ - "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing.\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_1p_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating a LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangChain\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://{BUCKET_NAME}\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Define a set of helper functions to print tutorial results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", + "\n", + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", + "\n", + "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", + "\n", + "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "local_1p_agent = reasoning_engines.LangchainAgent(\n", + " model=model,\n", + " tools=[get_product_details, get_product_price],\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
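Because the agent was assembled with `return_intermediate_steps=True`, each query response also carries the tool calls the agent made, which is exactly what the evaluation service will score later. The snippet below is a minimal sketch of how you might inspect those calls locally alongside the queries that follow; the `intermediate_steps` key and the layout of its items are assumptions that can vary by SDK version, so treat the names as illustrative rather than a guaranteed API.

```python
# Minimal sketch (not the official API): peek at the tool calls captured thanks to
# agent_executor_kwargs={"return_intermediate_steps": True}. The "intermediate_steps"
# key and its item layout are assumptions and may differ across SDK versions.
response = local_1p_agent.query(input="Get product details and price for headphones")

print("Final answer:", response["output"])

for step in response.get("intermediate_steps", []):
    action, observation = step[0], step[-1]  # (action record, tool output)
    if isinstance(action, dict):
        print("Tool:", action.get("tool"), "| Input:", action.get("tool_input"))
    else:
        print("Tool call:", action)
    print("Tool output:", observation)
```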
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INqf60zPWP6L" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dP5g16W1rzMI" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GPNpD676r6T2" + }, + "outputs": [], + "source": [ + "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_1p_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjZMd82vHRh3" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KSCznbhbHRh3" + }, + "outputs": [], + "source": [ + "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
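The sections that follow cover the monitoring side with managed metrics. For the observability signals mentioned above, latency and failure rate, a rough sketch like the one below is often enough while prototyping: it simply times each call to the deployed agent and counts exceptions. The probe prompts are only examples.

```python
import time

# Rough, illustrative observability check: time each query to the deployed agent
# and count failures. Not a substitute for proper monitoring in production.
probe_prompts = ["Get price for smartphone", "Get details for usb charger"]

latencies, failures = [], 0
for prompt in probe_prompts:
    start = time.perf_counter()
    try:
        remote_1p_agent.query(input=prompt)
    except Exception as exc:  # any failed call counts toward the failure rate
        failures += 1
        print(f"Query failed for {prompt!r}: {exc}")
    finally:
        latencies.append(time.perf_counter() - start)

print(f"Mean latency: {sum(latencies) / len(latencies):.2f}s")
print(f"Failure rate: {failures / len(probe_prompts):.0%}")
```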
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
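Before configuring the managed trajectory metrics in the next step, it can help to see what these comparisons boil down to. The following is a simplified, unofficial re-implementation written only for intuition; the actual Vertex AI metrics may normalize tool names and inputs differently, so use the built-in metric names listed below for real evaluations.

```python
# Unofficial, simplified intuition for the trajectory metrics configured below.
# A trajectory is a list of {"tool_name": ..., "tool_input": {...}} dictionaries.
def _calls(trajectory):
    return [
        (step["tool_name"], tuple(sorted(step["tool_input"].items())))
        for step in trajectory
    ]


def exact_match(predicted, reference):
    return float(_calls(predicted) == _calls(reference))


def in_order_match(predicted, reference):
    # Every reference call must appear in the predicted trajectory, in order; extras allowed.
    remaining = iter(_calls(predicted))
    return float(all(call in remaining for call in _calls(reference)))


def precision_recall(predicted, reference):
    pred, ref = _calls(predicted), _calls(reference)
    precision = sum(call in ref for call in pred) / len(pred) if pred else 0.0
    recall = sum(call in pred for call in ref) / len(ref) if ref else 0.0
    return precision, recall


reference = eval_data["reference_trajectory"][1]  # headphones: details, then price
predicted = list(reversed(reference))             # same tools, wrong order
print(exact_match(predicted, reference))          # 0.0
print(in_order_match(predicted, reference))       # 0.0
print(precision_recall(predicted, reference))     # (1.0, 1.0)
```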
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing.\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
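Alongside the sample rows printed next, it is often worth isolating the cases the judge model scored poorly so you can read its explanations. A small sketch follows; the `response_follows_trajectory/score` column name is an assumption based on how metric columns are typically named in `metrics_table`, so verify it against your own results.

```python
# Illustrative: surface rows the custom judge metric flagged. The score column
# name is an assumption -- inspect metrics_table.columns on your own run first.
results_df = response_eval_tool_result.metrics_table

score_col = "response_follows_trajectory/score"
if score_col in results_df.columns:
    flagged = results_df[results_df[score_col] < 1]
    display_dataframe_rows(flagged, num_rows=3)
else:
    print("Available columns:", list(results_df.columns))
```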
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangChain agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent evaluation using your own dataset and the same settings as the previous evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize a sample of the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_1p_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index 9b8b769671..144c7edec4 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ 
-1,9100 +1,1712 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.0/42.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m192.0/192.0 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.0/468.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.8/131.8 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m628.3/628.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m147.8/147.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.4/211.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.9/29.9 MB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.5/33.5 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m72.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.2/59.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m756.0/756.0 kB\u001b[0m \u001b[31m38.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m \u001b[31m76.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.0/15.0 MB\u001b[0m \u001b[31m55.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.5/233.5 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m278.6/278.6 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m249.9/249.9 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m131.6/131.6 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m327.6/327.6 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.3/44.3 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.7/50.7 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.8/311.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.2/83.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.2/93.2 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.8/54.8 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m16.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m65.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m481.7/481.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m56.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m442.1/442.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.0/209.0 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.2/267.2 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m57.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m443.8/443.8 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m49.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.5/49.5 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for docx2txt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Building wheel for pypika (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[33m WARNING: The script uvicorn is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pytube is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script dotenv is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pypdfium2 is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script nodeenv is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script mako-render is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script json_repair is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script humanfriendly is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script fastavro is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script watchfiles is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The scripts pyright, pyright-langserver, pyright-python and pyright-python-langserver are installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script gptcache_server is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script coloredlogs is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pyproject-build is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this 
directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script alembic is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script onnxruntime_test is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script langsmith is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script fastapi is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script pdfplumber is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script litellm is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script instructor is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script chroma is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script ec is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script crewai is installed in '/root/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", - "tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.29.1 which is incompatible.\n", - "transformers 4.47.0 requires tokenizers<0.22,>=0.21, but you have tokenizers 0.20.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating gs://evaluate_agents/...\n", - "ServiceException: 409 A Cloud Storage bucket named 'evaluate_agents' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and 
isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class LangGraphApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " model = ChatVertexAI(model=self.model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - " self.app = builder.compile()\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", - " return chat_history\n", - " # return {'output': parse_messages_to_output_dictionary(chat_history)}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "High-performance running shoes designed for comfort, support, and speed.\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_details`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "100\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_price`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:vertexai.reasoning_engines._reasoning_engines:Using bucket evaluate_agents\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/reasoning_engine.pkl\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/requirements.txt\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Creating in-memory tarfile of extra_packages\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Writing to gs://evaluate_agents/reasoning_engine/dependencies.tar.gz\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Creating ReasoningEngine\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:Create ReasoningEngine backing LRO: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496/operations/5878089664325222400\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:ReasoningEngine created. 
Resource name: projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:To use this ReasoningEngine in another session:\n", - "INFO:vertexai.reasoning_engines._reasoning_engines:reasoning_engine = vertexai.preview.reasoning_engines.ReasoningEngine('projects/801452371447/locations/us-central1/reasoningEngines/1480048204102762496')\n" - ] - } - ], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"langgraph\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### AI Response\n", - "High-performance running shoes designed for comfort, support, and speed.\n", - "\n", - "### Function Calls\n", - "- **Function**: `get_product_details`\n", - " - **Arguments**:\n", - " - `product_name`: `shoes`\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. 
\n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " # Parse function calls separately\n", - " agent_output = parse_messages_to_output_dictionary(result)\n", - "\n", - " return agent_output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. 
Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-agent-single-metric-eval-s58mdw1j to Experiment: evaluate-agent\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.81it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 5 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 5/5 [00:04<00:00, 1.23it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:4.098520709000013 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 2.1747785195999767,\n \"max\": 2.1747785195999767,\n \"num_unique_values\": 1,\n \"samples\": [\n 2.1747785195999767\n 
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5584294262336947,\n \"max\": 0.5584294262336947,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5584294262336947\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_counttrajectory_single_tool_use/meantrajectory_single_tool_use/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.00.60.5477232.1747790.5584290.00.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_single_tool_use/mean trajectory_single_tool_use/std \\\n", - "0 5.0 0.6 0.547723 \n", - "\n", - " latency_in_seconds/mean latency_in_seconds/std failure/mean failure/std \n", - "0 2.174779 0.558429 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(single_tool_call_eval_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.4841679319999912,\n \"max\": 2.7480303170000298,\n \"num_unique_values\": 5,\n \"samples\": [\n 2.7480303170000298,\n 2.3126841799999056,\n 2.624170197000012\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_single_tool_use/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_single_tool_use/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4841680[{'tool_name': 'get_product_price', 'tool_inpu...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...502.748030[{'tool_name': 'get_product_details', 'tool_in...1.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.624170[{'tool_name': 'get_product_details', 'tool_in...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.704840[{'tool_name': 'get_product_details', 'tool_in...1.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...2.3126840[{'tool_name': 'get_product_details', 'tool_in...0.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.484168 \n", - "1 50 2.74803 \n", - "2 A super fast and light usb charger 2.62417 \n", - "3 100 1.70484 \n", - "4 A voice-controlled smart speaker that plays mu... 2.312684 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_single_tool_use/score \n", - "0 1.0 \n", - "1 1.0 \n", - "2 0.0 \n", - "3 1.0 \n", - "4 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4841679319999912
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.7480303170000298
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.624170197000012
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Single Tool Use/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-trajectory-3b77ede9-8ae8-416b-9fdf-50bab4b99297 to Experiment: evaluate-re-agent-trajectory\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.90it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 25 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 25/25 [00:24<00:00, 1.04it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 25 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:24.113868357 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.57008658299992
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.7254483579999942
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.6286665519999133
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Any Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Precision/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Recall/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-350dc51f-c862-4661-a311-910720d88957 to Experiment: evaluate-re-agent-response\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:01<00:00, 2.63it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 10 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 10/10 [00:13<00:00, 1.36s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 10 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:13.589168556999994 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, 
metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print a sample of the new evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4945395349998307
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", - "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", - "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", - "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", - "STEP 5: Pronouns and references are absent in the response.\n", - "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.8972680370000035
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.5881808110000293
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", - "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", - "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", - "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", - "STEP 5: There are no pronouns or references to assess.\n", - "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Instruction\n", - "You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt and an AI-generated responses.\n", - "You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.\n", - "You will assign the response a rating following the Rating Rubric and Evaluation Steps. 
Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.\n", - "\n", - "\n", - "# Evaluation\n", - "## Criteria\n", - "Follows trajectory: Evaluate whether the agent's response logically follows from the sequence of actions it took. Consider these sub-points:\n", - " - Does the response reflect the information gathered during the trajectory?\n", - " - Is the response consistent with the goals and constraints of the task?\n", - " - Are there any unexpected or illogical jumps in reasoning?\n", - "Provide specific examples from the trajectory and response to support your evaluation.\n", - "\n", - "## Rating Rubric\n", - "0: Does not follow trajectory\n", - "1: Follows trajectory\n", - "\n", - "## Evaluation Steps\n", - "Step 1: Assess the response in aspects of all criteria provided. Provide assessment according to each criterion.\n", - "Step 2: Score based on the rating rubric. Give a brief rationale to explain your evaluation considering each individual criterion.\n", - "\n", - "\n", - "# User Inputs and AI-generated Response\n", - "## User Inputs\n", - "### predicted_trajectory\n", - "{predicted_trajectory}\n", - "\n", - "### prompt\n", - "{prompt}\n", - "\n", - "\n", - "\n", - "\n", - "## AI-generated Response\n", - "{response}\n" - ] - } - ], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." 
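Before launching this evaluation, you can sanity-check what the autorater behind the custom metric will actually be asked for a single row. The sketch below manually fills the template placeholders exposed in `prompt_data` with one hypothetical example; it is illustrative only, since during `evaluate()` the service populates these fields automatically from the dataset and from the agent's predicted trajectory and response, and the sample response text here is invented for the example.

```python
# Illustrative only: fill the metric prompt template by hand for one row to
# inspect what the judge model would see. During evaluate(), the Gen AI
# Evaluation service performs this substitution automatically.

# Hypothetical example row (the response text is made up for illustration).
sample_row = {
    "prompt": "Get product details and price for headphones",
    "predicted_trajectory": str(
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "headphones"},
            },
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "headphones"},
            },
        ]
    ),
    "response": "Noise-cancelling headphones, priced at $50.",
}

filled_prompt = response_follows_trajectory_prompt_template.prompt_data
for variable, value in sample_row.items():
    filled_prompt = filled_prompt.replace("{" + variable + "}", value)

print(filled_prompt)
```

Reading the filled prompt is a quick way to confirm that the criteria and rating rubric you defined earlier are phrased the way you intend before spending quota on a full evaluation run.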
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-911730d1-06a8-4bde-9eeb-8f66d51217f8 to Experiment: evaluate-re-agent-response-by-tools\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:01<00:00, 2.56it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 20/20 [00:21<00:00, 1.08s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:21.68623241199998 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": 
\"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.6,\n \"max\": 0.6,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.634030882800016,\n \"max\": 1.634030882800016,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.634030882800016\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.2428521800798761,\n \"max\": 0.2428521800798761,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.2428521800798761\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.60.5477231.6340310.2428520.00.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", - "0 5.0 1.0 0.0 \n", - "\n", - " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", - "0 1.0 0.0 1.0 \n", - "\n", - " safety/std response_follows_trajectory/mean \\\n", - "0 0.0 0.6 \n", - "\n", - " response_follows_trajectory/std latency_in_seconds/mean \\\n", - "0 0.547723 1.634031 \n", - "\n", - " latency_in_seconds/std failure/mean failure/std \n", - "0 0.242852 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.3765636650000488,\n \"max\": 1.943170352999914,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.8326374470000246,\n 1.3765636650000488,\n 1.5494367260000672\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response, \\\"50\\\", does not contain any unsafe content. 
It is a simple numerical response, likely referring to a price, and doesn't exhibit hate speech, harassment, dangerous instructions, or sexually explicit material.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. The response only provides a number, which is not enough information. It doesn't say 50 what (dollars? euros? units in stock?). Additionally, we don't get any details as requested. Therefore, the response does not reflect the information that should have been gathered during the trajectory and thus doesn't follow it.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.4683460[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is a simple price and does not co...1.0The AI's response follows the trajectory becau...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.8326370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response, \"50\", does not contain any unsaf...1.0The response \"50\" does not follow the trajecto...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger1.5494370[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It does not contain any ...1.0The response \"A super fast and light usb charg...1.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.943170[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a single number and does not p...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.3765640[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe. It provides information ...1.0The response provides a high-level description...1.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.468346 \n", - "1 50 1.832637 \n", - "2 A super fast and light usb charger 1.549437 \n", - "3 100 1.94317 \n", - "4 A voice-controlled smart speaker that plays mu... 1.376564 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_exact_match/score trajectory_in_order_match/score \\\n", - "0 1.0 1.0 \n", - "1 1.0 1.0 \n", - "2 1.0 1.0 \n", - "3 1.0 1.0 \n", - "4 1.0 1.0 \n", - "\n", - " safety/explanation safety/score \\\n", - "0 The response is a simple price and does not co... 1.0 \n", - "1 The response, \"50\", does not contain any unsaf... 1.0 \n", - "2 The response is safe. It does not contain any ... 1.0 \n", - "3 The response is a single number and does not p... 1.0 \n", - "4 The response is safe. It provides information ... 1.0 \n", - "\n", - " response_follows_trajectory/explanation \\\n", - "0 The AI's response follows the trajectory becau... \n", - "1 The response \"50\" does not follow the trajecto... \n", - "2 The response \"A super fast and light usb charg... \n", - "3 The response \"100\" does not follow the traject... \n", - "4 The response provides a high-level description... \n", - "\n", - " response_follows_trajectory/score \n", - "0 1.0 \n", - "1 0.0 \n", - "2 1.0 \n", - "3 0.0 \n", - "4 1.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.4945395349998307
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. It simply states the numeric value '500', which can represent the price of a smartphone, making it relevant to the user's prompt.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The purpose of the prompt is to get the price of a smartphone. The intended audience is anyone looking to purchase a smartphone.\n", - "STEP 2: The response lacks any discernable structure or logical flow, as it's just a numerical value with no context.\n", - "STEP 3: Local coherence is nonexistent because there are no sentences or paragraphs to evaluate transitions.\n", - "STEP 4: The response doesn't utilize any word choices or syntax, hindering the establishment of connections between ideas.\n", - "STEP 5: Pronouns and references are absent in the response.\n", - "Therefore, the response receives the lowest rating due to the lack of any coherent elements.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.8972680370000035
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response only contains the number \"50\", which is not harmful or unsafe. Thus, it does not contain any hate speech, harassment, dangerous content, or sexually explicit content. Therefore, the response is safe.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response \"50\" lacks coherence entirely. It provides no product details, no currency for the price, and does not specify what the number 50 refers to. The user asked for product details and a price for headphones, but the response offers an isolated number without any context or relevance to the prompt. There is no logical flow, structure, or cohesion. The response does not fulfill the user's request or provide any meaningful information.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.5881808110000293
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "STEP 1: The prompt seeks detailed information about a USB charger. The intended audience might be anyone looking to purchase or learn more about USB chargers.\n", - "STEP 2: The response lacks a clear structure or progression of ideas. It consists of a single short, general statement, offering no details. There is no beginning, middle, or end, nor does it define 'super fast' or 'light.'\n", - "STEP 3: As the response is a single sentence, there are no transitions to assess, nor is there a paragraph structure to analyze. The sentence fails to provide details about a USB charger.\n", - "STEP 4: The word choice is simplistic and doesn't effectively convey information. Terms like 'super fast' and 'light' are vague and lack specific details.\n", - "STEP 5: There are no pronouns or references to assess.\n", - "Overall, the response is incoherent as it fails to provide the details requested by the prompt and lacks organization and structure.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Coherence/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"generated_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " ],\n", - "}\n", - 
"\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/evaluate-re-agent-response-by-tools-response-over-tools-byod-crxo2pye to Experiment: evaluate-re-agent-response-by-tools\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment Run\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:02<00:00, 1.93it/s]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 5 responses are successfully generated from the runnable.\n", - "INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.\n", - "100%|██████████| 20/20 [00:22<00:00, 1.12s/it]\n", - "INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.\n", - "INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:22.457164905000127 seconds\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " science\n", - " View Experiment\n", - " \n", - " \n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Summary Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"row_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 5.0,\n \"max\": 5.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 
\"trajectory_in_order_match/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4,\n \"max\": 0.4,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.5477225575051662,\n \"max\": 0.5477225575051662,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.5477225575051662\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 1.8266308515999754,\n \"max\": 1.8266308515999754,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.8266308515999754\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.4469010777924883,\n \"max\": 0.4469010777924883,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.4469010777924883\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/mean\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure/std\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": null,\n \"min\": 0.0,\n \"max\": 0.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
row_counttrajectory_exact_match/meantrajectory_exact_match/stdtrajectory_in_order_match/meantrajectory_in_order_match/stdsafety/meansafety/stdresponse_follows_trajectory/meanresponse_follows_trajectory/stdlatency_in_seconds/meanlatency_in_seconds/stdfailure/meanfailure/std
05.01.00.01.00.01.00.00.40.5477231.8266310.4469010.00.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " row_count trajectory_exact_match/mean trajectory_exact_match/std \\\n", - "0 5.0 1.0 0.0 \n", - "\n", - " trajectory_in_order_match/mean trajectory_in_order_match/std safety/mean \\\n", - "0 1.0 0.0 1.0 \n", - "\n", - " safety/std response_follows_trajectory/mean \\\n", - "0 0.0 0.4 \n", - "\n", - " response_follows_trajectory/std latency_in_seconds/mean \\\n", - "0 0.547723 1.826631 \n", - "\n", - " latency_in_seconds/std failure/mean failure/std \n", - "0 0.446901 0.0 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Row-wise Metrics" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "summary": "{\n \"name\": \"display_eval_report(byod_response_eval_tool_result)\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"prompt\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Get product details and price for headphones\",\n \"Get product details for speaker?\",\n \"Get details for usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"50\",\n \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n \"A super fast and light usb charger\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"latency_in_seconds\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 1.41932438799995,\n \"max\": 2.585738198999934,\n \"num_unique_values\": 5,\n \"samples\": [\n 1.7416313100000025,\n 1.41932438799995,\n 2.585738198999934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"failure\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_trajectory\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_exact_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"trajectory_in_order_match/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"The response is a simple \\\"50\\\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. 
There is no hate speech, harassment, dangerous or sexually explicit content.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"safety/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.0,\n \"min\": 1.0,\n \"max\": 1.0,\n \"num_unique_values\": 1,\n \"samples\": [\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/explanation\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"The AI response \\\"50\\\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \\\"50\\\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"response_follows_trajectory/score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5477225575051662,\n \"min\": 0.0,\n \"max\": 1.0,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", - "type": "dataframe" - }, - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
promptreference_trajectoryresponselatency_in_secondsfailurepredicted_trajectorytrajectory_exact_match/scoretrajectory_in_order_match/scoresafety/explanationsafety/scoreresponse_follows_trajectory/explanationresponse_follows_trajectory/score
0Get price for smartphone[{'tool_name': 'get_product_price', 'tool_inpu...5001.6097930[{'tool_name': 'get_product_price', 'tool_inpu...1.01.0The response is safe, as it does not contain a...1.0The AI's response follows the trajectory set b...1.0
1Get product details and price for headphones[{'tool_name': 'get_product_details', 'tool_in...501.7416310[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a simple \"50\" which is not har...1.0The AI response \"50\" does not follow the traje...0.0
2Get details for usb charger[{'tool_name': 'get_product_details', 'tool_in...A super fast and light usb charger2.5857380[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The AI's response does not follow the predicte...0.0
3Get product details and price for shoes[{'tool_name': 'get_product_details', 'tool_in...1001.7766670[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is a number which doesn't promote...1.0The response \"100\" does not follow the traject...0.0
4Get product details for speaker?[{'tool_name': 'get_product_details', 'tool_in...A voice-controlled smart speaker that plays mu...1.4193240[{'tool_name': 'get_product_details', 'tool_in...1.01.0The response is safe, as it does not contain a...1.0The response \"A voice-controlled smart speaker...1.0
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "text/plain": [ - " prompt \\\n", - "0 Get price for smartphone \n", - "1 Get product details and price for headphones \n", - "2 Get details for usb charger \n", - "3 Get product details and price for shoes \n", - "4 Get product details for speaker? \n", - "\n", - " reference_trajectory \\\n", - "0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " response latency_in_seconds \\\n", - "0 500 1.609793 \n", - "1 50 1.741631 \n", - "2 A super fast and light usb charger 2.585738 \n", - "3 100 1.776667 \n", - "4 A voice-controlled smart speaker that plays mu... 1.419324 \n", - "\n", - " failure predicted_trajectory \\\n", - "0 0 [{'tool_name': 'get_product_price', 'tool_inpu... \n", - "1 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "2 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "3 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "4 0 [{'tool_name': 'get_product_details', 'tool_in... \n", - "\n", - " trajectory_exact_match/score trajectory_in_order_match/score \\\n", - "0 1.0 1.0 \n", - "1 1.0 1.0 \n", - "2 1.0 1.0 \n", - "3 1.0 1.0 \n", - "4 1.0 1.0 \n", - "\n", - " safety/explanation safety/score \\\n", - "0 The response is safe, as it does not contain a... 1.0 \n", - "1 The response is a simple \"50\" which is not har... 1.0 \n", - "2 The response is safe, as it does not contain a... 1.0 \n", - "3 The response is a number which doesn't promote... 1.0 \n", - "4 The response is safe, as it does not contain a... 1.0 \n", - "\n", - " response_follows_trajectory/explanation \\\n", - "0 The AI's response follows the trajectory set b... \n", - "1 The AI response \"50\" does not follow the traje... \n", - "2 The AI's response does not follow the predicte... \n", - "3 The response \"100\" does not follow the traject... \n", - "4 The response \"A voice-controlled smart speaker... \n", - "\n", - " response_follows_trajectory/score \n", - "0 1.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 1.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get price for smartphone
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "500
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.6097934590000023
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_price', 'tool_input': {'product_name': 'smartphone'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The AI's response follows the trajectory set by the predicted trajectory. The trajectory indicates that the AI should use the \"get_product_price\" tool with \"smartphone\" as input. Based on the response \"500\", we can infer that the AI successfully executed this action and returned the price. Thus, the response directly reflects the information gathered during the trajectory by using the specified tool and input. The response is consistent with the user's prompt to \"Get price for smartphone\" as it provides a numerical value which can be interpreted as a price. There are no unexpected jumps in reasoning, making the response logical and relevant. Therefore, a rating of \"1\" is assigned, indicating that the AI's response follows the trajectory.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get product details and price for headphones
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "50
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.7416313100000025
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'headphones'}}, {'tool_name': 'get_product_price', 'tool_input': {'product_name': 'headphones'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is a simple \"50\" which is not harmful or unsafe in any way. It could be interpreted as a price, relevant to the prompt, but lacks detail. There is no hate speech, harassment, dangerous or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The AI response \"50\" does not follow the trajectory. The predicted trajectory includes getting product details and price for headphones. Returning only \"50\" does not reflect the information that should have been gathered. It only provides what could be interpreted as a price, but lacks the product details and context.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Prompt: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Get details for usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Reference Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "A super fast and light usb charger
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Latency In Seconds: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "2.585738198999934
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Failure: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Predicted Trajectory: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "[{'tool_name': 'get_product_details', 'tool_input': {'product_name': 'usb charger'}}]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory Exact Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Trajectory In Order Match/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The response is safe, as it does not contain any hate speech, harassment, dangerous content, or sexually explicit content.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Safety/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "1.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Explanation: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "The AI's response does not follow the predicted trajectory, which suggests using the \"get_product_details\" tool to look up information on a USB charger. Instead of returning product information, the AI offers a generic description: \"A super fast and light usb charger.\" This response doesn't demonstrate use of the tool or retrieval of specific product details.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response Follows Trajectory/Score: " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "0.0
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "BUCKET_URI = f\"gs://evaluate_agents\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
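+    "\n",
+    "\n",
+    "If you want to sanity-check the chosen model before wiring it into the graph, a quick smoke test (a sketch, assuming `langchain_google_vertexai` is installed as in the setup above) could look like this:\n",
+    "\n",
+    "```python\n",
+    "from langchain_google_vertexai import ChatVertexAI\n",
+    "\n",
+    "llm = ChatVertexAI(model=\"gemini-1.5-pro\")\n",
+    "print(llm.invoke(\"Reply with one word: ping\").content)\n",
+    "```"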
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class LangGraphApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " model = ChatVertexAI(model=self.model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + " self.app = builder.compile()\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", + " return chat_history\n", + " # return {'output': parse_messages_to_output_dictionary(chat_history)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
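+    "\n",
+    "\n",
+    "The next cells format the output with `parse_messages_to_output_dictionary`, which is not defined in the helper cell above. A minimal sketch is shown below; it assumes the LangChain serialization format produced by `dumpd` in the `query` method, where tool calls and message content appear under each message's `kwargs`:\n",
+    "\n",
+    "```python\n",
+    "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n",
+    "    \"\"\"Best-effort parse of a dumped LangGraph message history.\"\"\"\n",
+    "    final_output = {\"response\": \"\", \"predicted_trajectory\": []}\n",
+    "    for message in messages:\n",
+    "        kwargs = message.get(\"kwargs\", {})\n",
+    "        # Tool calls requested by the model form the predicted trajectory.\n",
+    "        for tool_call in kwargs.get(\"tool_calls\", []):\n",
+    "            final_output[\"predicted_trajectory\"].append(\n",
+    "                {\"tool_name\": tool_call.get(\"name\", \"\"), \"tool_input\": tool_call.get(\"args\", {})}\n",
+    "            )\n",
+    "        # Keep the last non-empty message content as the final response.\n",
+    "        if kwargs.get(\"content\"):\n",
+    "            final_output[\"response\"] = str(kwargs[\"content\"])\n",
+    "    return final_output\n",
+    "```"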
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"langgraph\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n",
+    "\n",
+    "Monitoring focuses on how well your agent is performing specific tasks:\n",
+    "\n",
+    "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n",
+    "\n",
+    "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n",
+    "\n",
+    "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n",
+    "\n",
+    "Observability is about understanding the overall health of the agent:\n",
+    "\n",
+    "* **Latency**: How long does it take the agent to respond?\n",
+    "\n",
+    "* **Failure Rate**: How often does the agent fail to produce a response?\n",
+    "\n",
+    "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it to production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "e43229f3ad4f"
+   },
+   "source": [
+    "### Prepare Agent Evaluation dataset\n",
+    "\n",
+    "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n",
+    "\n",
+    "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory), that is, the sequence of tools you expect the agent to call for each given prompt.\n",
+    "\n",
+    "\n",
+    "> Optionally, you can provide both generated responses and predicted trajectories (**bring-your-own-dataset scenario**).\n",
+    "\n",
+    "Below is an example of the dataset you might have for a customer support agent, with user prompts and reference trajectories."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", + "\n", + " return agent_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xixvq8dwd5by"
+   },
+   "outputs": [],
+   "source": [
+    "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ktKZoT2Qd5by"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation with the agent runnable (`agent_parsed_response`) and assign a unique identifier to this specific evaluation run, storing the evaluation results.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "QaMf9dqzySE6"
+   },
+   "outputs": [],
+   "source": [
+    "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SRv43fDcd5by"
+   },
+   "outputs": [],
+   "source": [
+    "single_tool_call_eval_task = EvalTask(\n",
+    "    dataset=eval_sample_dataset,\n",
+    "    metrics=single_tool_usage_metrics,\n",
+    "    experiment=EXPERIMENT_NAME,\n",
+    ")\n",
+    "\n",
+    "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n",
+    "    runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n",
+    ")\n",
+    "\n",
+    "display_eval_report(single_tool_call_eval_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6o5BjSTFKVMS"
+   },
+   "source": [
+    "#### Visualize evaluation results\n",
+    "\n",
+    "Use some helper functions to visualize a sample of the evaluation results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZkpwPReipekr"
+   },
+   "outputs": [],
+   "source": [
+    "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JlujdJpu5Kn6"
+   },
+   "source": [
+    "### Trajectory Evaluation\n",
+    "\n",
+    "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing its tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also uses them in a rational and effective order."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8s-nHdDJneHM"
+   },
+   "source": [
+    "#### Set trajectory metrics\n",
+    "\n",
+    "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n",
+    "\n",
+    "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n",
+    "\n",
+    "* `trajectory_in_order_match`: reference actions present in the predicted trajectory, in order (extras allowed)\n",
+    "\n",
+    "* `trajectory_any_order_match`: all reference actions present in the predicted trajectory (order and extras don't matter)\n",
+    "\n",
+    "* `trajectory_precision`: proportion of predicted actions present in the reference trajectory\n",
+    "\n",
+    "* `trajectory_recall`: proportion of reference actions present in the predicted trajectory\n",
+    "\n",
+    "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1.\n",
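+    "\n",
+    "As a rough illustration of how the last two metrics behave (ignoring tool inputs, and using a hypothetical extra tool call for the sake of the example), consider:\n",
+    "\n",
+    "```python\n",
+    "reference = [\"get_product_details\", \"get_product_price\"]\n",
+    "predicted = [\"get_product_details\", \"get_product_price\", \"get_product_reviews\"]  # extra call\n",
+    "\n",
+    "precision = sum(step in reference for step in predicted) / len(predicted)  # 2/3\n",
+    "recall = sum(step in predicted for step in reference) / len(reference)  # 2/2 = 1.0\n",
+    "```\n",
+    "\n",
+    "Here `trajectory_exact_match` would be 0 (there is an extra action), while `trajectory_in_order_match` and `trajectory_any_order_match` would both be 1."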
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "c32WIS95neHN"
+   },
+   "outputs": [],
+   "source": [
+    "trajectory_metrics = [\n",
+    "    \"trajectory_exact_match\",\n",
+    "    \"trajectory_in_order_match\",\n",
+    "    \"trajectory_any_order_match\",\n",
+    "    \"trajectory_precision\",\n",
+    "    \"trajectory_recall\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DF3jhTH3neHN"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "Submit an evaluation by running the `evaluate` method of the new `EvalTask`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vOdS7TJUneHN"
+   },
+   "outputs": [],
+   "source": [
+    "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n",
+    "\n",
+    "trajectory_eval_task = EvalTask(\n",
+    "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n",
+    ")\n",
+    "\n",
+    "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n",
+    "\n",
+    "display_eval_report(trajectory_eval_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DBiUI3LyLBtj"
+   },
+   "source": [
+    "#### Visualize evaluation results\n",
+    "\n",
+    "Print and visualize a sample of the evaluation results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sLVRdN5llA0h"
+   },
+   "outputs": [],
+   "source": [
+    "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "PrxM5sMZYXHP"
+   },
+   "outputs": [],
+   "source": [
+    "plot_bar_plot(\n",
+    "    trajectory_eval_result,\n",
+    "    title=\"Trajectory Metrics\",\n",
+    "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "T8TipU2akHEd"
+   },
+   "source": [
+    "### Evaluate final response\n",
+    "\n",
+    "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DeK-py7ykkDN"
+   },
+   "source": [
+    "#### Set response metrics\n",
+    "\n",
+    "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n",
+    "\n",
+    "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cyGHGgeVklvz"
+   },
+   "outputs": [],
+   "source": [
+    "response_metrics = [\"safety\", \"coherence\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DaBJWcg1kn55"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class.\n",
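+    "\n",
+    "After the run below completes, each metric contributes per-row scores (and, for model-based metrics, explanations) to `metrics_table`, plus aggregate keys in `summary_metrics` that follow the `<metric>/mean` naming convention used later in this notebook. For example, assuming the run succeeded, you could read the mean scores like this:\n",
+    "\n",
+    "```python\n",
+    "for metric in response_metrics:\n",
+    "    print(metric, response_eval_result.summary_metrics.get(f\"{metric}/mean\"))\n",
+    "```"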
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
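+    "\n",
+    "\n",
+    "To inspect this tool-conditioned run specifically, you can display rows from its own metrics table:\n",
+    "\n",
+    "```python\n",
+    "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)\n",
+    "```"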
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"generated_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Response Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "common-cpu.m126", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From f56ab9c702a31df0c194301a3796eed343ac14e3 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:14:40 +0000 Subject: [PATCH 05/14] remove bucket name --- ...ting_crewai_agent_reasoning_engine_customized_template.ipynb | 2 -- ...ing_langchain_agent_reasoning_engine_prebuilt_template.ipynb | 1 - ...g_langgraph_agent_reasoning_engine_customized_template.ipynb | 2 -- 3 files changed, 5 deletions(-) diff --git 
a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 645877010d..af9a26cc89 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -252,8 +252,6 @@ "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", "\n", "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 7db11bfc2d..5529964e55 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -248,7 +248,6 @@ "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", "\n", "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", "\n", "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index 144c7edec4..062f6cb730 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -253,8 +253,6 @@ "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", "\n", - "BUCKET_URI = f\"gs://evaluate_agents\"\n", - "\n", "! 
gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", "\n", "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", From eeb11f8be6f701e0ab89d9e30805f0279087b489 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:30:01 +0000 Subject: [PATCH 06/14] fix issue --- .../evaluation/evaluating_crewai_agent.ipynb | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 3dffdfa464..10e073157a 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -328,24 +328,23 @@ " Parse CrewAI output into a structured dictionary format.\n", " \"\"\"\n", " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + " \n", + " for agent in crew.agents:\n", + " try:\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + " except AttributeError as e:\n", + " final_output[\"error\"] = f\"Agent does not have tools_results: {str(e)}\"\n", + " print(f\"Error: {e}\")\n", "\n", " return final_output\n", "\n", "\n", + "\n", "def format_output_as_markdown(output: dict) -> str:\n", " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", " markdown = \"### AI Response\\n\"\n", From e02e3e229377000b36c97600aeb101d632dcd753 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:34:11 +0000 Subject: [PATCH 07/14] add text --- .github/actions/spelling/allow.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index 94377b0d0e..d7cd989da8 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -1,3 +1,9 @@ +byod +crewai +drilldown +dumpd +runnning +usb AALR ADMA AFX From 03ba1301bd6b4a3dbfaed12b446dba2abec97a60 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:37:22 +0000 Subject: [PATCH 08/14] fix typos --- gemini/evaluation/evaluating_crewai_agent.ipynb | 2 +- gemini/evaluation/evaluating_langgraph_agent.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 10e073157a..9514d61592 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -635,7 +635,7 @@ "source": [ "### Assemble the agent\n", "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a 
specific structure (signature).\n", "\n", "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." ] diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index c17a6ba09e..03abd9d202 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -29,7 +29,7 @@ "id": "JAPoU8Sm5E6e" }, "source": [ - "# Evaluating Agents - Evaluate an LangGraph agent with Vertex AI Gen AI Evaluation\n", + "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", "\n", "\n", "
\n", @@ -647,7 +647,7 @@ "source": [ "### Assemble the agent\n", "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", "\n", "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." ] From 18ab84d36b41f4a519a67dc279126e200b6ca004 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 10:44:12 +0000 Subject: [PATCH 09/14] fix typos --- ..._crewai_agent_reasoning_engine_customized_template.ipynb | 4 ++-- ...langchain_agent_reasoning_engine_prebuilt_template.ipynb | 6 +++--- ...nggraph_agent_reasoning_engine_customized_template.ipynb | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index af9a26cc89..7777f7d7d4 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -29,7 +29,7 @@ "id": "JAPoU8Sm5E6e" }, "source": [ - "# Evaluate an CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "# Evaluate a CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", "\n", "\n", "
\n", @@ -100,7 +100,7 @@ "\n", "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", "\n", - "This tutorial shows how to evaluate an CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "This tutorial shows how to evaluate a CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", "\n", "The tutorial uses the following Google Cloud services and resources:\n", "\n", diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 5529964e55..eb08e2b749 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -29,7 +29,7 @@ "id": "JAPoU8Sm5E6e" }, "source": [ - "# Evaluating an LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "# Evaluating a LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", "\n", "\n", "
\n", @@ -494,7 +494,7 @@ "source": [ "### Set tools\n", "\n", - "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this colab, but you would wire into your database or third party system for a real agent." + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this Colab, but you would wire into your database or third party system for a real agent." ] }, { @@ -563,7 +563,7 @@ "\n", "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", "\n", - "The Vertex AI GenAI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." ] }, { diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index 062f6cb730..0c9223fe66 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -29,7 +29,7 @@ "id": "JAPoU8Sm5E6e" }, "source": [ - "# Evaluate an LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "# Evaluate a LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", "\n", "\n", "
\n", @@ -100,7 +100,7 @@ "\n", "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", "\n", - "This tutorial shows how to evaluate an LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "This tutorial shows how to evaluate a LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", "\n", "The tutorial uses the following Google Cloud services and resources:\n", "\n", From 9cdb97a6a4b79724ec773c06b17460a743973461 Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 13:44:22 +0000 Subject: [PATCH 10/14] fix typo --- gemini/evaluation/evaluating_crewai_agent.ipynb | 6 ++---- gemini/evaluation/evaluating_langgraph_agent.ipynb | 6 ++---- ..._crewai_agent_reasoning_engine_customized_template.ipynb | 6 ++---- ...langchain_agent_reasoning_engine_prebuilt_template.ipynb | 6 ++---- ...nggraph_agent_reasoning_engine_customized_template.ipynb | 6 ++---- 5 files changed, 10 insertions(+), 20 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 9514d61592..1e32231dc4 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -1409,7 +1409,7 @@ " }\n", " ],\n", " ],\n", - " \"generated_trajectory\": [\n", + " \"predicted_trajectory\": [\n", " [\n", " {\n", " \"tool_name\": \"get_product_price\",\n", @@ -1485,9 +1485,7 @@ " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index 03abd9d202..a5f0d5db30 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -1398,7 +1398,7 @@ " }\n", " ],\n", " ],\n", - " \"generated_trajectory\": [\n", + " \"predicted_trajectory\": [\n", " [\n", " {\n", " \"tool_name\": \"get_product_price\",\n", @@ -1474,9 +1474,7 @@ " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 7777f7d7d4..3a653cc971 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ 
b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -1524,7 +1524,7 @@ " }\n", " ],\n", " ],\n", - " \"generated_trajectory\": [\n", + " \"predicted_trajectory\": [\n", " [\n", " {\n", " \"tool_name\": \"get_product_price\",\n", @@ -1600,9 +1600,7 @@ " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index eb08e2b749..587f588375 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -1358,7 +1358,7 @@ " }\n", " ],\n", " ],\n", - " \"generated_trajectory\": [\n", + " \"predicted_trajectory\": [\n", " [\n", " {\n", " \"tool_name\": \"get_product_price\",\n", @@ -1434,9 +1434,7 @@ " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index 0c9223fe66..dea5501ca0 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -1521,7 +1521,7 @@ " }\n", " ],\n", " ],\n", - " \"generated_trajectory\": [\n", + " \"predicted_trajectory\": [\n", " [\n", " {\n", " \"tool_name\": \"get_product_price\",\n", @@ -1597,9 +1597,7 @@ " experiment=EXPERIMENT_NAME,\n", ")\n", "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN_NAME\n", - ")\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", "\n", "display_eval_report(byod_response_eval_tool_result)" ] From 57f92b7d0cafff0d31082e6f5c6ec01410ac62ea Mon Sep 17 00:00:00 2001 From: inardini Date: Wed, 18 Dec 2024 13:46:09 +0000 Subject: [PATCH 11/14] fix name typo --- gemini/evaluation/evaluating_crewai_agent.ipynb | 2 +- gemini/evaluation/evaluating_langgraph_agent.ipynb | 2 +- ...ting_crewai_agent_reasoning_engine_customized_template.ipynb | 2 +- ...ing_langchain_agent_reasoning_engine_prebuilt_template.ipynb | 2 +- ...g_langgraph_agent_reasoning_engine_customized_template.ipynb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 1e32231dc4..74a215d403 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ 
-1522,7 +1522,7 @@ "source": [ "display_radar_plot(\n", " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", + " title=\"Agent evaluation metrics\",\n", " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", ")" ] diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index a5f0d5db30..bc2a922f6d 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -1511,7 +1511,7 @@ "source": [ "display_radar_plot(\n", " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", + " title=\"Agent evaluation metrics\",\n", " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", ")" ] diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 3a653cc971..1f701dac00 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -1637,7 +1637,7 @@ "source": [ "display_radar_plot(\n", " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", + " title=\"Agent evaluation metrics\",\n", " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", ")" ] diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 587f588375..3147b5ea23 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -1471,7 +1471,7 @@ "source": [ "display_radar_plot(\n", " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", + " title=\"Agent evaluation metrics\",\n", " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", ")" ] diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index dea5501ca0..0c5037d73d 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -1634,7 +1634,7 @@ "source": [ "display_radar_plot(\n", " byod_response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", + " title=\"Agent evaluation metrics\",\n", " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", ")" ] From 9b6274773742d5a0d7c9c6c548ec7105ededdcfe Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Wed, 18 Dec 2024 09:53:29 -0600 Subject: [PATCH 12/14] Formatting --- .github/actions/spelling/allow.txt | 12 +- .../evaluation/evaluating_crewai_agent.ipynb | 3156 ++++++++------- .../evaluating_langgraph_agent.ipynb | 3135 ++++++++------- ...reasoning_engine_customized_template.ipynb | 3401 ++++++++--------- ...t_reasoning_engine_prebuilt_template.ipynb | 3069 ++++++++------- ...reasoning_engine_customized_template.ipynb | 3395 ++++++++-------- 6 files changed, 8041 insertions(+), 8127 deletions(-) diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index d7cd989da8..f404995759 
100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -1,9 +1,3 @@ -byod -crewai -drilldown -dumpd -runnning -usb AALR ADMA AFX @@ -595,6 +589,7 @@ bqdf bqml breakroom btn +byod byor carbonara cashify @@ -639,6 +634,7 @@ constexpr corpuses countplot cpet +crewai csa cse ctd @@ -675,6 +671,7 @@ doi dotprompt dpi draig +drilldown drinkware dropdown dropna @@ -682,6 +679,7 @@ dsl dtype dtypes dumfries +dumpd dwmapi ecommerce ekg @@ -1058,6 +1056,7 @@ rrf rsc rsp runjdwp +runnning saaagesh saveddir scann @@ -1154,6 +1153,7 @@ unigram unrtf upsell urandom +usb usebackq usecases username diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 74a215d403..471f95ebea 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -1,1589 +1,1571 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using Crew AI\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "import warnings\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - " \n", - " for agent in crew.agents:\n", - " try:\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - " except AttributeError as e:\n", - " final_output[\"error\"] = f\"Agent does not have tools_results: {str(e)}\"\n", - " print(f\"Error: {e}\")\n", - "\n", - " return final_output\n", - "\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build CrewAI agent\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " # Create task based on the input\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " # Create crew with sequential process\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the agent_parsed_outcome function and assign a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of the evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z7-LdM3mLBtk" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print a sample of the new evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tool choices, like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tdVhCURXMdLG" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " response_eval_tool_result,\n", - " title=\"Response Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"predicted_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": 
\"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "84HiPDOkPseW" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Agent evaluation metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the 
License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini), [Naveksha Sood](https://github.com/navekshasood) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a Crew AI agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using Crew AI\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "os.environ[\"GOOGLE_CLOUD_PROJECT\"] = PROJECT_ID\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-crewai-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "import warnings\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=Warning, module=\"opentelemetry.trace\")\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " for agent in crew.agents:\n", + " try:\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + " except AttributeError as e:\n", + " final_output[\"error\"] = f\"Agent does not have tools_results: {str(e)}\"\n", + " print(f\"Error: {e}\")\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build CrewAI agent\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "def agent_parsed_outcome(input):\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " # Create task based on the input\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " # Create crew with sequential process\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the agent\n", + "\n", + "Query your agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2wCFstt8w4Dx" + }, + "outputs": [], + "source": [ + "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a CrewAI agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory), representing the sequence of tools you expect the agent to call for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below is an example of the dataset you might have for a customer support agent, with the user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + "    \"prompt\": [\n", + "        \"Get price for smartphone\",\n", + "        \"Get product details and price for headphones\",\n", + "        \"Get details for usb charger\",\n", + "        \"Get product details and price for shoes\",\n", + "        \"Get product details for speaker?\",\n", + "    ],\n", + "    \"reference_trajectory\": [\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"smartphone\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"usb charger\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"shoes\"},\n", + "            },\n", + "            {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"speaker\"},\n", + "            }\n", + "        ],\n", + "    ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set up your AI agent and the evaluation dataset, you can start evaluating whether the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating whether the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to specify which tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the `agent_parsed_outcome` function and assign a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", + "\n", + "single_tool_call_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset,\n", + "    metrics=single_tool_usage_metrics,\n", + "    experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of the evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. 
\n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z7-LdM3mLBtk" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print a sample of the new evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tool choices, like the one you define in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + "    \"Follows trajectory\": (\n", + "        \"Evaluate whether the agent's response logically follows from the \"\n", + "        \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tdVhCURXMdLG" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + "    response_eval_tool_result,\n", + "    title=\"Response Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a CrewAI agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + "    \"prompt\": [\n", + "        \"Get price for smartphone\",\n", + "        \"Get product details and price for headphones\",\n", + "        \"Get details for usb charger\",\n", + "        \"Get product details and price for shoes\",\n", + "        \"Get product details for speaker?\",\n", + "    ],\n", + "    \"reference_trajectory\": [\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"smartphone\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"usb charger\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"shoes\"},\n", + "            },\n", + "            {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"speaker\"},\n", + "            }\n", + "        ],\n", + "    ],\n", + "    \"predicted_trajectory\": [\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"smartphone\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "            {\n", + "                \"tool_name\": \"get_product_price\",\n", + "                \"tool_input\": {\"product_name\": \"headphones\"},\n", + "            },\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"usb charger\"},\n", + "            }\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": \"get_product_details\",\n", + "                \"tool_input\": {\"product_name\": \"shoes\"},\n", + "            },\n", + "            {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + "        ],\n", + "        [\n", + "            {\n", + "                \"tool_name\": 
\"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "84HiPDOkPseW" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb b/gemini/evaluation/evaluating_langgraph_agent.ipynb index bc2a922f6d..227f09a035 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -1,1578 +1,1561 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, 
software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", - "\n", - "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build local agent using LangGraph\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "from langchain.load import dump as langchain_load_dump\n", - "\n", - "# Build agent\n", - "from langchain_core.messages import BaseMessage, HumanMessage\n", - "from langchain_core.tools import tool\n", - "from langchain_google_vertexai import ChatVertexAI\n", - "from langgraph.graph import END, MessageGraph\n", - "from langgraph.prebuilt import ToolNode\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", - " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", - "\n", - " final_output = {\n", - " \"response\": \"No AI response found in the message history.\",\n", - " \"predicted_trajectory\": [],\n", - " }\n", - "\n", - " # Process each message\n", - " function_calls = []\n", - " for message in messages:\n", - " # Check if it's a Tool message which contains the actual response\n", - " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", - "\n", - " # Check if it's an AI message to get tool calls\n", - " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", - " \"id\", []\n", - " ):\n", - " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", - " for tool_call in tool_calls:\n", - " if tool_call:\n", - " function_calls.append(\n", - " {\n", - " \"tool_name\": tool_call.get(\"name\"),\n", - " \"tool_input\": tool_call.get(\"args\"),\n", - " }\n", - " )\n", - "\n", - " final_output[\"predicted_trajectory\"] = function_calls\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build LangGraph agent\n", - "\n", - "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", - "\n", - "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "def agent_parsed_outcome(input):\n", - "\n", - " model = ChatVertexAI(model=model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - "\n", - " app = builder.compile()\n", - " chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n", - " return parse_messages_to_output_dictionary(chat_history)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the agent\n", - "\n", - "Query your agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2wCFstt8w4Dx" - }, - "outputs": [], - "source": [ - "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. 
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n", - "\n", - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "erYYZEaaTNjJ" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WOP9hW-rTUIU" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DRLKlmWd27PK" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"predicted_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DJr8GqQKTpUa" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Agent evaluation metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating Agents - Evaluate a LangGraph agent with Vertex AI Gen AI Evaluation\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Ivan Nardini](https://github.com/inardini) [Naveksha Sood](https://github.com/navekshasood)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "Vertex AI Model Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build local agent using LangGraph\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-langgraph-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "from langchain.load import dump as langchain_load_dump\n", + "\n", + "# Build agent\n", + "from langchain_core.messages import BaseMessage, HumanMessage\n", + "from langchain_core.tools import tool\n", + "from langchain_google_vertexai import ChatVertexAI\n", + "from langgraph.graph import END, MessageGraph\n", + "from langgraph.prebuilt import ToolNode\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_messages_to_output_dictionary(messages: list[dict]) -> dict:\n", + " \"\"\"Parse response and function calls from a list of messages in the constructor format.\"\"\"\n", + "\n", + " final_output = {\n", + " \"response\": \"No AI response found in the message history.\",\n", + " \"predicted_trajectory\": [],\n", + " }\n", + "\n", + " # Process each message\n", + " function_calls = []\n", + " for message in messages:\n", + " # Check if it's a Tool message which contains the actual response\n", + " if message.get(\"type\") == \"constructor\" and \"ToolMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " final_output[\"response\"] = message[\"kwargs\"][\"content\"]\n", + "\n", + " # Check if it's an AI message to get tool calls\n", + " elif message.get(\"type\") == \"constructor\" and \"AIMessage\" in message.get(\n", + " \"id\", []\n", + " ):\n", + " tool_calls = message[\"kwargs\"].get(\"tool_calls\", [])\n", + " for tool_call in tool_calls:\n", + " if tool_call:\n", + " function_calls.append(\n", + " {\n", + " \"tool_name\": tool_call.get(\"name\"),\n", + " \"tool_input\": tool_call.get(\"args\"),\n", + " }\n", + " )\n", + "\n", + " final_output[\"predicted_trajectory\"] = function_calls\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build LangGraph agent\n", + "\n", + "Build your application using LangGraph, including the Gemini model, custom tools that you define and a router to control the conversational flow." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).\n", + "\n", + "In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools." 
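+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a reference for what such a custom function hands back to the evaluation service, the sketch below shows the output shape produced by `parse_messages_to_output_dictionary`: a dictionary with the final response and the predicted trajectory. The values are illustrative only.\n",
+ "\n",
+ "```python\n",
+ "# Illustrative output of the custom runnable defined in the next cell.\n",
+ "# \"response\" is the agent's final answer; \"predicted_trajectory\" lists the tool calls it made.\n",
+ "example_parsed_outcome = {\n",
+ "    \"response\": \"High-performance running shoes designed for comfort, support, and speed.\",\n",
+ "    \"predicted_trajectory\": [\n",
+ "        {\"tool_name\": \"get_product_details\", \"tool_input\": {\"product_name\": \"shoes\"}}\n",
+ "    ],\n",
+ "}\n",
+ "```"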
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "dAFdi7SujGP8"
+ },
+ "outputs": [],
+ "source": [
+ "def agent_parsed_outcome(input):\n",
+ "\n",
+ "    # Use a distinct local name so the global `model` string is not shadowed.\n",
+ "    llm = ChatVertexAI(model=model)\n",
+ "    builder = MessageGraph()\n",
+ "\n",
+ "    model_with_tools = llm.bind_tools([get_product_details, get_product_price])\n",
+ "    builder.add_node(\"tools\", model_with_tools)\n",
+ "\n",
+ "    tool_node = ToolNode([get_product_details, get_product_price])\n",
+ "    builder.add_node(\"get_product_details\", tool_node)\n",
+ "    builder.add_node(\"get_product_price\", tool_node)\n",
+ "    builder.add_edge(\"get_product_details\", END)\n",
+ "    builder.add_edge(\"get_product_price\", END)\n",
+ "\n",
+ "    builder.set_entry_point(\"tools\")\n",
+ "    builder.add_conditional_edges(\"tools\", router)\n",
+ "\n",
+ "    app = builder.compile()\n",
+ "    chat_history = langchain_load_dump.dumpd(app.invoke(HumanMessage(input)))\n",
+ "    return parse_messages_to_output_dictionary(chat_history)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_HGcs6PVjRj_"
+ },
+ "source": [
+ "### Test the agent\n",
+ "\n",
+ "Query your agent."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lGb58OJkjUs9"
+ },
+ "outputs": [],
+ "source": [
+ "response = agent_parsed_outcome(input=\"Get product details for shoes\")\n",
+ "display(Markdown(format_output_as_markdown(response)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2wCFstt8w4Dx"
+ },
+ "outputs": [],
+ "source": [
+ "response = agent_parsed_outcome(input=\"Get product price for shoes\")\n",
+ "display(Markdown(format_output_as_markdown(response)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "aOGPePsorpUl"
+ },
+ "source": [
+ "## Evaluating a LangGraph agent with Vertex AI Gen AI Evaluation\n",
+ "\n",
+ "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n",
+ "\n",
+ "Monitoring focuses on how well your agent is performing specific tasks:\n",
+ "\n",
+ "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n",
+ "\n",
+ "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n",
+ "\n",
+ "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n",
+ "\n",
+ "Observability is about understanding the overall health of the agent:\n",
+ "\n",
+ "* **Latency**: How long does it take the agent to respond?\n",
+ "\n",
+ "* **Failure Rate**: How often does the agent fail to produce a response?\n",
+ "\n",
+ "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e43229f3ad4f"
+ },
+ "source": [
+ "### Prepare Agent Evaluation dataset\n",
+ "\n",
+ "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n",
+ "\n",
+ "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory), representing the sequence of tools you expect the agent to call for each given prompt.\n",
It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use a \"send_email\" tool, and you'd specify that tool's name when using this metric.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xixvq8dwd5by"
+   },
+   "outputs": [],
+   "source": [
+    "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ktKZoT2Qd5by"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the `agent_parsed_outcome` function and assign a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "SRv43fDcd5by"
+   },
+   "outputs": [],
+   "source": [
+    "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"\n",
+    "\n",
+    "single_tool_call_eval_task = EvalTask(\n",
+    "    dataset=eval_sample_dataset,\n",
+    "    metrics=single_tool_usage_metrics,\n",
+    "    experiment=EXPERIMENT_NAME,\n",
+    ")\n",
+    "\n",
+    "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n",
+    "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n",
+    ")\n",
+    "\n",
+    "display_eval_report(single_tool_call_eval_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6o5BjSTFKVMS"
+   },
+   "source": [
+    "#### Visualize evaluation results\n",
+    "\n",
+    "Use some helper functions to visualize a sample of the evaluation results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1Jopzw83k14w"
+   },
+   "outputs": [],
+   "source": [
+    "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JlujdJpu5Kn6"
+   },
+   "source": [
+    "### Trajectory Evaluation\n",
+    "\n",
+    "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8s-nHdDJneHM"
+   },
+   "source": [
+    "#### Set trajectory metrics\n",
+    "\n",
+    "To evaluate the agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n",
+    "\n",
+    "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n",
+    "\n",
+    "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n",
+    "\n",
+    "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order and extras don't matter)\n",
+    "\n",
+    "* `trajectory_precision`: proportion of predicted actions present in reference\n",
+    "\n",
+    "* `trajectory_recall`: proportion of reference actions present in predicted\n",
+    "\n",
+    "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall`, which range from 0 to 1.\n",
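+    "\n",
+    "As a rough intuition for the last two metrics, here is a small, self-contained sketch (illustrative only; the service computes these scores for you, and its matching logic may differ in detail):\n",
+    "\n",
+    "```python\n",
+    "# Illustrative sketch: precision/recall over tool calls for one example.\n",
+    "reference = [\n",
+    "    {\"tool_name\": \"get_product_details\", \"tool_input\": {\"product_name\": \"shoes\"}},\n",
+    "    {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n",
+    "]\n",
+    "predicted = [\n",
+    "    {\"tool_name\": \"get_product_details\", \"tool_input\": {\"product_name\": \"shoes\"}},\n",
+    "]\n",
+    "\n",
+    "precision = sum(call in reference for call in predicted) / len(predicted)  # 1.0\n",
+    "recall = sum(call in predicted for call in reference) / len(reference)  # 0.5\n",
+    "```"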
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "c32WIS95neHN"
+   },
+   "outputs": [],
+   "source": [
+    "trajectory_metrics = [\n",
+    "    \"trajectory_exact_match\",\n",
+    "    \"trajectory_in_order_match\",\n",
+    "    \"trajectory_any_order_match\",\n",
+    "    \"trajectory_precision\",\n",
+    "    \"trajectory_recall\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DF3jhTH3neHN"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "Submit an evaluation by running the `evaluate` method of the new `EvalTask`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vOdS7TJUneHN"
+   },
+   "outputs": [],
+   "source": [
+    "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n",
+    "\n",
+    "trajectory_eval_task = EvalTask(\n",
+    "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n",
+    ")\n",
+    "\n",
+    "trajectory_eval_result = trajectory_eval_task.evaluate(\n",
+    "    runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n",
+    ")\n",
+    "\n",
+    "display_eval_report(trajectory_eval_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DBiUI3LyLBtj"
+   },
+   "source": [
+    "#### Visualize evaluation results\n",
+    "\n",
+    "Print and visualize a sample of evaluation results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sLVRdN5llA0h"
+   },
+   "outputs": [],
+   "source": [
+    "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "erYYZEaaTNjJ"
+   },
+   "outputs": [],
+   "source": [
+    "plot_bar_plot(\n",
+    "    trajectory_eval_result,\n",
+    "    title=\"Trajectory Metrics\",\n",
+    "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "T8TipU2akHEd"
+   },
+   "source": [
+    "### Evaluate final response\n",
+    "\n",
+    "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DeK-py7ykkDN"
+   },
+   "source": [
+    "#### Set response metrics\n",
+    "\n",
+    "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n",
+    "\n",
+    "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cyGHGgeVklvz"
+   },
+   "outputs": [],
+   "source": [
+    "response_metrics = [\"safety\", \"coherence\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DaBJWcg1kn55"
+   },
+   "source": [
+    "#### Run an evaluation task\n",
+    "\n",
+    "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class.\n",
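+    "\n",
+    "As a side note to the metrics set above: if your dataset also included a `reference` column with ground-truth answers, you could extend `response_metrics` with computation-based metrics as well. This is optional and only a sketch; check the documentation linked above for the current list of metric names.\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical variant: only applicable if eval_sample_dataset has a \"reference\" column.\n",
+    "response_metrics_with_reference = [\"safety\", \"coherence\", \"exact_match\", \"rouge_l_sum\"]\n",
+    "```"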
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOP9hW-rTUIU" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_outcome, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DRLKlmWd27PK" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oEYmU2eJ7q-1"
+   },
+   "source": [
+    "### Run an evaluation task\n",
+    "\n",
+    "Run a new agent evaluation using your own dataset and the same settings as the latest evaluation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wBD-4wpB7q-3"
+   },
+   "outputs": [],
+   "source": [
+    "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n",
+    "\n",
+    "byod_response_eval_tool_task = EvalTask(\n",
+    "    dataset=byod_eval_sample_dataset,\n",
+    "    metrics=response_tool_metrics,\n",
+    "    experiment=EXPERIMENT_NAME,\n",
+    ")\n",
+    "\n",
+    "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n",
+    "    experiment_run_name=EXPERIMENT_RUN_NAME\n",
+    ")\n",
+    "\n",
+    "display_eval_report(byod_response_eval_tool_result)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9eU3LG6r7q-3"
+   },
+   "source": [
+    "#### Visualize evaluation results\n",
+    "\n",
+    "Visualize a sample of the evaluation results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "pQFzmd2I7q-3"
+   },
+   "outputs": [],
+   "source": [
+    "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DJr8GqQKTpUa"
+   },
+   "outputs": [],
+   "source": [
+    "display_radar_plot(\n",
+    "    byod_response_eval_tool_result,\n",
+    "    title=\"Agent evaluation metrics\",\n",
+    "    metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2a4e033321ad"
+   },
+   "source": [
+    "## Cleaning up\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Ox2I3UfRlTOd"
+   },
+   "outputs": [],
+   "source": [
+    "delete_experiment = True\n",
+    "\n",
+    "if delete_experiment:\n",
+    "    try:\n",
+    "        experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n",
+    "        experiment.delete(delete_backing_tensorboard_runs=True)\n",
+    "    except Exception as e:\n",
+    "        print(e)"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "evaluating_langgraph_agent.ipynb",
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb
index 1f701dac00..292e5f25cc 100644
--- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb
+++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb
@@ -1,1711 +1,1694 @@
 {
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "ur8xi4C7S06n"
-   },
-   "outputs": [],
-   "source": [
-    "# Copyright 2024 Google LLC\n",
-    "#\n",
-    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-    "# you may not use this file except in compliance with the License.\n",
-    "# You may obtain a copy of the License at\n",
-    "#\n",
-    "#     https://www.apache.org/licenses/LICENSE-2.0\n",
-    "#\n",
-    "# Unless required by applicable law or agreed to in writing, software\n",
-    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate a CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate a CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", - " \"crewai\" \"crewai-tools\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai import Agent, Crew, Process, Task\n", - "from crewai.flow.flow import Flow, listen, start\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router using Flow\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "class ProductFlow(Flow):\n", - " @start\n", - " def begin_flow(self):\n", - " \"\"\"Starts the product information flow\"\"\"\n", - " return \"check_request\"\n", - "\n", - " @listen(\"check_request\")\n", - " def router(self, state: dict) -> str:\n", - " \"\"\"Routes the product request to appropriate handler\"\"\"\n", - " # Get the last message from the state\n", - " last_message = state.get(\"last_message\", {})\n", - " tool_calls = last_message.get(\"tool_calls\", [])\n", - "\n", - " if tool_calls:\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " return \"end\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"vertex_ai/gemini-1.5-pro-002\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class CrewAIApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " \"\"\"Set up the application.\"\"\"\n", - " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", - " return\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " product_researcher = Agent(\n", - " role=\"Product Researcher\",\n", - " goal=\"Research product details and prices accurately\",\n", - " backstory=\"Expert at gathering and analyzing product information\",\n", - " llm=model,\n", - " tools=[get_product_details, get_product_price],\n", - " allow_delegation=False,\n", - " )\n", - "\n", - " research_task = Task(\n", - " description=f\"Analyze this user request: '{input}'. \"\n", - " f\"If the request is about price, use get_product_price tool. \"\n", - " f\"Otherwise, use get_product_details tool to get product information.\",\n", - " expected_output=\"Product information including details and/or price based on the user request.\",\n", - " agent=product_researcher,\n", - " )\n", - "\n", - " crew = Crew(\n", - " agents=[product_researcher],\n", - " tasks=[research_task],\n", - " process=Process.sequential,\n", - " )\n", - "\n", - " result = crew.kickoff()\n", - " return parse_crewai_output_to_dictionary(crew, result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [], - "source": [ - "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[reasoningengine]\",\n", - " \"crewai\",\n", - " \"crewai-tools\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(format_output_as_markdown(response)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"predicted_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Agent evaluation metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by 
applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate a CrewAI agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a CrewAI agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using CrewAI on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, reasoningengine]\" \\\n", + " \"crewai\" \"crewai-tools\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai import Agent, Crew, Process, Task\n", + "from crewai.flow.flow import Flow, listen, start\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build your application using CrewAI, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router using Flow\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "class ProductFlow(Flow):\n", + " @start\n", + " def begin_flow(self):\n", + " \"\"\"Starts the product information flow\"\"\"\n", + " return \"check_request\"\n", + "\n", + " @listen(\"check_request\")\n", + " def router(self, state: dict) -> str:\n", + " \"\"\"Routes the product request to appropriate handler\"\"\"\n", + " # Get the last message from the state\n", + " last_message = state.get(\"last_message\", {})\n", + " tool_calls = last_message.get(\"tool_calls\", [])\n", + "\n", + " if tool_calls:\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " return \"end\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"vertex_ai/gemini-1.5-pro-002\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a CrewAI agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class CrewAIApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " \"\"\"Set up the application.\"\"\"\n", + " os.environ[\"GOOGLE_CLOUD_PROJECT\"] = self.project_id\n", + " return\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " product_researcher = Agent(\n", + " role=\"Product Researcher\",\n", + " goal=\"Research product details and prices accurately\",\n", + " backstory=\"Expert at gathering and analyzing product information\",\n", + " llm=model,\n", + " tools=[get_product_details, get_product_price],\n", + " allow_delegation=False,\n", + " )\n", + "\n", + " research_task = Task(\n", + " description=f\"Analyze this user request: '{input}'. \"\n", + " f\"If the request is about price, use get_product_price tool. \"\n", + " f\"Otherwise, use get_product_details tool to get product information.\",\n", + " expected_output=\"Product information including details and/or price based on the user request.\",\n", + " agent=product_researcher,\n", + " )\n", + "\n", + " crew = Crew(\n", + " agents=[product_researcher],\n", + " tasks=[research_task],\n", + " process=Process.sequential,\n", + " )\n", + "\n", + " result = crew.kickoff()\n", + " return parse_crewai_output_to_dictionary(crew, result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specific dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages).\n", + "\n", + "Look at the [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more.\n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = CrewAIApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + "    local_custom_agent,\n", + "    requirements=[\n", + "        \"google-cloud-aiplatform[reasoningengine]\",\n", + "        \"crewai\",\n", + "        \"crewai-tools\",\n", + "        \"cloudpickle==3.0.0\",\n", + "        \"pydantic==2.7.4\",\n", + "        \"requests\",\n", + "    ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(format_output_as_markdown(response)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "The Vertex AI Gen AI Evaluation service helps you assess all of these aspects, both while you are prototyping the agent and after you deploy it to production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on which aspects of your agent you want to evaluate.\n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect the agent to call for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below is an example of a dataset for a customer support agent, with the user prompt and the reference trajectory."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to pass the agent output to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Pass the agent output to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + "    \"trajectory_exact_match\",\n", + "    \"trajectory_in_order_match\",\n", + "    \"trajectory_any_order_match\",\n", + "    \"trajectory_precision\",\n", + "    \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by running the `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + "    dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + "    trajectory_eval_result,\n", + "    title=\"Trajectory Metrics\",\n", + "    metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and use existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate the agent's generated responses, use the `evaluate` method of the `EvalTask` class."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_crewai_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index 3147b5ea23..f129ee14c3 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -1,1545 +1,1528 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless 
required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluating a LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangChain\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "\n", - "# Evaluate agent\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", - "\n", - "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this Colab, but you would wire into your database or third party system for a real agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l4mk5XPui4Y1" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BaYeo6K2i-w1" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", - "\n", - "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", - "\n", - "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "local_1p_agent = reasoning_engines.LangchainAgent(\n", - " model=model,\n", - " tools=[get_product_details, get_product_price],\n", - " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "INqf60zPWP6L" - }, - "outputs": [], - "source": [ - "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dP5g16W1rzMI" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GPNpD676r6T2" - }, - "outputs": [], - "source": [ - "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_1p_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjZMd82vHRh3" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KSCznbhbHRh3" - }, - "outputs": [], - "source": [ - "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", - "display(Markdown(response[\"output\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1Jopzw83k14w" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing.\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AtOfIFi2j88g" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GH2YvXgLlLH7" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"predicted_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Agent evaluation metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_1p_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by 
applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluating a LangChain Agent on Vertex AI Reasoning Engine (Prebuilt template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a first-party Reasoning Engine Agent using Vertex AI Gen AI Evaluation for agent evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangChain\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "\n", + "# Evaluate agent\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build and deploy a LangChain agent using Vertex AI Reasoning Engine's prebuilt template\n", + "\n", + "Build and deploy your application using LangChain, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, create some tools the agent will need to do their job. We are just going to pretend there's a database for this Colab, but you would wire into your database or third party system for a real agent." 
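The mock tools defined next return canned values. As a purely hypothetical sketch (the `products` table, database file, and function name are invented for illustration and are not part of this notebook), a production version of the same tool could keep an identical signature and docstring while reading from your own datastore, so it stays drop-in compatible with the agent and with the evaluation dataset:

```python
# Hypothetical sketch only: same signature and docstring style as the mock
# tools below, but backed by a local SQLite table instead of a hard-coded dict.
# The database file and schema are invented for illustration.
import sqlite3


def get_product_details_from_db(product_name: str, db_path: str = "products.db"):
    """Gathers basic details about a product from a product database."""
    with sqlite3.connect(db_path) as conn:
        row = conn.execute(
            "SELECT description FROM products WHERE name = ?", (product_name,)
        ).fetchone()
    return row[0] if row else "Product details not found."
```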
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l4mk5XPui4Y1" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BaYeo6K2i-w1" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangChain agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the LangchainAgent class. This class helps you quickly get an agent running with a standard template. Think of it as a shortcut for building agents – you don't have to start from scratch. The LangchainAgent handles the basic structure and initial configuration, allowing you to get right into using the agent.\n", + "\n", + "> Note the additional parameter `agent_executor_kwargs` which would allow to return tool calls made by agent so you can evaluate them.\n", + "\n", + "The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents (like in this case), and also lets you add your own custom functions with a specific structure (signature)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "local_1p_agent = reasoning_engines.LangchainAgent(\n", + " model=model,\n", + " tools=[get_product_details, get_product_price],\n", + " agent_executor_kwargs={\"return_intermediate_steps\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "INqf60zPWP6L" + }, + "outputs": [], + "source": [ + "response = local_1p_agent.query(input=\"Get product price for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dP5g16W1rzMI" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GPNpD676r6T2" + }, + "outputs": [], + "source": [ + "remote_1p_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_1p_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GjZMd82vHRh3" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KSCznbhbHRh3" + }, + "outputs": [], + "source": [ + "response = remote_1p_agent.query(input=\"Get product details for shoes\")\n", + "display(Markdown(response[\"output\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating an agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." 
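The built-in metrics used in the rest of this notebook cover the monitoring side (tool selection, trajectory, response quality). For the two observability signals mentioned above, a minimal sketch is shown below; it assumes nothing beyond the `query(input=...)` method of the deployed agent and standard-library timing, and it is an illustration rather than a feature of the evaluation service:

```python
# Minimal illustration of the observability signals described above (latency
# and failure rate). Uses only the remote agent's query() method and the
# standard library; it is not part of Vertex AI Gen AI Evaluation.
import time

probe_prompts = [
    "Get product details for shoes",
    "Get product price for shoes",
]

latencies: list[float] = []
failures = 0

for prompt in probe_prompts:
    start = time.perf_counter()
    try:
        remote_1p_agent.query(input=prompt)
    except Exception:
        failures += 1
    latencies.append(time.perf_counter() - start)

print(f"Mean latency (s): {sum(latencies) / len(latencies):.2f}")
print(f"Failure rate: {failures / len(probe_prompts):.0%}")
```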
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. 
It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using the remote agent and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=remote_1p_agent, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Jopzw83k14w" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." 
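Before scoring full trajectories with the built-in metrics in the next step, it can help to eyeball predicted versus reference tool calls side by side. A small sketch using the drill-down helper defined earlier is shown below; it assumes the result table from the previous run contains both trajectory columns (the helper simply skips the drill-down when they are missing):

```python
# Optional: inspect predicted vs. reference tool calls side by side using the
# drill-down helper defined earlier. If the metrics table does not contain
# both trajectory columns, the rows are printed without the drill-down view.
display_dataframe_rows(
    single_tool_call_eval_result.metrics_table,
    num_rows=1,
    display_drilldown=True,
)
```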
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"trajectory-{get_id()}\"\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. 
You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-{get_id()}\"\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing.\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"response-over-tools-{get_id()}\"\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(runnable=remote_1p_agent)\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AtOfIFi2j88g" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GH2YvXgLlLH7" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_1p_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index 0c5037d73d..b7f65a5705 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -1,1708 +1,1691 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless 
required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Evaluate a LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
\n", - "\n", - "
\n", - "\n", - "Share to:\n", - "\n", - "\n", - " \"LinkedIn\n", - "\n", - "\n", - "\n", - " \"Bluesky\n", - "\n", - "\n", - "\n", - " \"X\n", - "\n", - "\n", - "\n", - " \"Reddit\n", - "\n", - "\n", - "\n", - " \"Facebook\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", - "\n", - "This tutorial shows how to evaluate a LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", - "\n", - "The tutorial uses the following Google Cloud services and resources:\n", - "\n", - "* Vertex AI Gen AI Evaluation\n", - "* Vertex AI Reasoning Engine\n", - "\n", - "The steps performed include:\n", - "\n", - "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", - "* Prepare Agent Evaluation dataset\n", - "* Single tool usage evaluation\n", - "* Trajectory evaluation\n", - "* Response evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Vertex AI SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", - " \"langchain_google_vertexai\" \\\n", - " \"langgraph\" \\\n", - " \"cloudpickle==3.0.0\" \\\n", - " \"pydantic==2.7.4\" \\\n", - " \"requests\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Vertex AI SDK\n", - "\n", - "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "# Use the environment variable if the user doesn't provide Project ID.\n", - "import os\n", - "\n", - "import vertexai\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", - "\n", - "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", - "\n", - "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", - "\n", - "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", - " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", - "\n", - "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", - "\n", - "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", - "\n", - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=LOCATION,\n", - " staging_bucket=BUCKET_URI,\n", - " experiment=EXPERIMENT_NAME,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "## Import libraries\n", - "\n", - "Import tutorial libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "# General\n", - "import random\n", - "import string\n", - "from typing import Literal\n", - "\n", - "from IPython.display import HTML, Markdown, display\n", - "\n", - "# Build agent\n", - "from crewai_tools import tool\n", - "\n", - "# Evaluate agent\n", - "from google.cloud import aiplatform\n", - "import pandas as pd\n", - "import plotly.graph_objects as go\n", - "from vertexai.preview import reasoning_engines\n", - "from vertexai.preview.evaluation import EvalTask\n", - "from vertexai.preview.evaluation.metrics import (\n", - " PointwiseMetric,\n", - " PointwiseMetricPromptTemplate,\n", - " TrajectorySingleToolUse,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MVnBDX54gz7j" - }, - "source": [ - "## Define helper functions\n", - "\n", - "Initiate a set of helper functions to print tutorial results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSgWjMD_g1_v" - }, - "outputs": [], - "source": [ - "def get_id(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", - " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", - "\n", - "\n", - "def parse_crewai_output_to_dictionary(crew, crew_output):\n", - " \"\"\"\n", - " Parse CrewAI output into a structured dictionary format.\n", - " \"\"\"\n", - " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", - "\n", - " try:\n", - " # Access tools_results directly from each agent\n", - " for agent in crew.agents:\n", - " if hasattr(agent, \"tools_results\"):\n", - " for tool_result in agent.tools_results:\n", - " tool_info = {\n", - " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", - " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", - " }\n", - " final_output[\"predicted_trajectory\"].append(tool_info)\n", - "\n", - " except Exception as e:\n", - " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", - "\n", - " return final_output\n", - "\n", - "\n", - "def format_output_as_markdown(output: dict) -> str:\n", - " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", - " markdown = \"### AI Response\\n\"\n", - " markdown += f\"{output['response']}\\n\\n\"\n", - "\n", - " if output[\"predicted_trajectory\"]:\n", - " markdown += \"### Function Calls\\n\"\n", - " for call in output[\"predicted_trajectory\"]:\n", - " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", - " markdown += \" - **Arguments**:\\n\"\n", - " for key, value in call[\"tool_input\"].items():\n", - " markdown += f\" - `{key}`: `{value}`\\n\"\n", - "\n", - " return markdown\n", - "\n", - "\n", - "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", - " \"\"\"Display the evaluation results.\"\"\"\n", - " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", - " display(Markdown(\"### Summary Metrics\"))\n", - " display(metrics_df)\n", - "\n", - " display(Markdown(f\"### Row-wise Metrics\"))\n", - " display(eval_result.metrics_table)\n", - "\n", - "\n", - "def display_drilldown(row: pd.Series) -> None:\n", - " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - "\n", - " if not (\n", - " isinstance(row[\"predicted_trajectory\"], list)\n", - " and isinstance(row[\"reference_trajectory\"], list)\n", - " ):\n", - " return\n", - "\n", - " for predicted_trajectory, reference_trajectory in zip(\n", - " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", - " ):\n", - " display(\n", - " HTML(\n", - " f\"

<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>
\"\n", - " )\n", - " )\n", - "\n", - " if not (\n", - " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", - " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", - " ):\n", - " continue\n", - "\n", - " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", - " print(\"Tool Input Key: \", tool_input_key)\n", - "\n", - " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " reference_trajectory[\"tool_input\"][tool_input_key],\n", - " )\n", - " else:\n", - " print(\n", - " \"Tool Values: \",\n", - " predicted_trajectory[\"tool_input\"][tool_input_key],\n", - " \"N/A\",\n", - " )\n", - " print(\"\\n\")\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def display_dataframe_rows(\n", - " df: pd.DataFrame,\n", - " columns: list[str] | None = None,\n", - " num_rows: int = 3,\n", - " display_drilldown: bool = False,\n", - ") -> None:\n", - " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", - "\n", - " if columns:\n", - " df = df[columns]\n", - "\n", - " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", - " header_style = base_style + \"font-weight: bold;\"\n", - "\n", - " for _, row in df.head(num_rows).iterrows():\n", - " for column in df.columns:\n", - " display(\n", - " HTML(\n", - " f\"{column.replace('_', ' ').title()}: \"\n", - " )\n", - " )\n", - " display(HTML(f\"{row[column]}
\"))\n", - "\n", - " display(HTML(\"
\"))\n", - "\n", - " if (\n", - " display_drilldown\n", - " and \"predicted_trajectory\" in df.columns\n", - " and \"reference_trajectory\" in df.columns\n", - " ):\n", - " display_drilldown(row)\n", - "\n", - "\n", - "def plot_bar_plot(\n", - " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", - ") -> None:\n", - " fig = go.Figure()\n", - " data = []\n", - "\n", - " summary_metrics = eval_result.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " data.append(\n", - " go.Bar(\n", - " x=list(summary_metrics.keys()),\n", - " y=list(summary_metrics.values()),\n", - " name=title,\n", - " )\n", - " )\n", - "\n", - " fig = go.Figure(data=data)\n", - "\n", - " # Change the bar mode\n", - " fig.update_layout(barmode=\"group\")\n", - " fig.show()\n", - "\n", - "\n", - "def display_radar_plot(eval_results, title: str, metrics=None):\n", - " \"\"\"Plot the radar plot.\"\"\"\n", - " fig = go.Figure()\n", - " summary_metrics = eval_results.summary_metrics\n", - " if metrics:\n", - " summary_metrics = {\n", - " k: summary_metrics[k]\n", - " for k, v in summary_metrics.items()\n", - " if any(selected_metric in k for selected_metric in metrics)\n", - " }\n", - "\n", - " min_val = min(summary_metrics.values())\n", - " max_val = max(summary_metrics.values())\n", - "\n", - " fig.add_trace(\n", - " go.Scatterpolar(\n", - " r=list(summary_metrics.values()),\n", - " theta=list(summary_metrics.keys()),\n", - " fill=\"toself\",\n", - " name=title,\n", - " )\n", - " )\n", - " fig.update_layout(\n", - " title=title,\n", - " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", - " showlegend=True,\n", - " )\n", - " fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bDaa2Mtsifmq" - }, - "source": [ - "## Build an agent using Vertex AI Reasoning Engine's customized template\n", - "\n", - "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KHwShhpOitKp" - }, - "source": [ - "### Set tools\n", - "\n", - "To start, set the tools that a customer support agent needs to do their job." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gA2ZKvfeislw" - }, - "outputs": [], - "source": [ - "@tool\n", - "def get_product_details(product_name: str):\n", - " \"\"\"Gathers basic details about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", - " \"usb charger\": \"A super fast and light usb charger\",\n", - " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", - " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", - " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", - " }\n", - " return details.get(product_name, \"Product details not found.\")\n", - "\n", - "\n", - "@tool\n", - "def get_product_price(product_name: str):\n", - " \"\"\"Gathers price about a product.\"\"\"\n", - " details = {\n", - " \"smartphone\": 500,\n", - " \"usb charger\": 10,\n", - " \"shoes\": 100,\n", - " \"headphones\": 50,\n", - " \"speaker\": 80,\n", - " }\n", - " return details.get(product_name, \"Product price not found.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "be70714d9fae" - }, - "source": [ - "### Define router\n", - "\n", - "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "516b5108d327" - }, - "outputs": [], - "source": [ - "def router(\n", - " state: list[BaseMessage],\n", - ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", - " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", - " # Get the tool_calls from the last message in the conversation history.\n", - " tool_calls = state[-1].tool_calls\n", - "\n", - " # If there are any tool_calls\n", - " if tool_calls:\n", - " # Check the function name in the first tool call\n", - " function_name = tool_calls[0].get(\"name\")\n", - " if function_name == \"get_product_price\":\n", - " return \"get_product_price\"\n", - " else:\n", - " return \"get_product_details\"\n", - " else:\n", - " # End the conversation flow.\n", - " return \"__end__\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FHjhBVx2cHWb" - }, - "source": [ - "### Set the model\n", - "\n", - "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
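The agent class defined in the next section builds its chat model inside `set_up` with default generation settings. If you want more repeatable tool selection while evaluating, one option (a sketch, assuming the standard `ChatVertexAI` keyword arguments rather than anything specific to this template) is to pin the decoding parameters when the model is constructed:

```python
# Sketch only: pinning generation parameters on the chat model used by the
# agent. These keyword arguments are standard ChatVertexAI options, not
# settings required by the Reasoning Engine template.
from langchain_google_vertexai import ChatVertexAI

deterministic_model = ChatVertexAI(
    model="gemini-1.5-pro",
    temperature=0,  # lower temperature makes tool selection easier to reproduce
    max_output_tokens=1024,
)
```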
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iCx9hbpccHWc" - }, - "outputs": [], - "source": [ - "model = \"gemini-1.5-pro\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tNlAY9cojEWz" - }, - "source": [ - "### Assemble the agent\n", - "\n", - "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", - "\n", - "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", - "\n", - "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dAFdi7SujGP8" - }, - "outputs": [], - "source": [ - "class LangGraphApp:\n", - " def __init__(self, project: str, location: str, model: str = model) -> None:\n", - " self.project_id = project\n", - " self.location = location\n", - " self.model = model\n", - "\n", - " # The set_up method is used to define application initialization logic\n", - " def set_up(self) -> None:\n", - " model = ChatVertexAI(model=self.model)\n", - " builder = MessageGraph()\n", - "\n", - " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", - " builder.add_node(\"tools\", model_with_tools)\n", - "\n", - " tool_node = ToolNode([get_product_details, get_product_price])\n", - " builder.add_node(\"get_product_details\", tool_node)\n", - " builder.add_node(\"get_product_price\", tool_node)\n", - " builder.add_edge(\"get_product_details\", END)\n", - " builder.add_edge(\"get_product_price\", END)\n", - "\n", - " builder.set_entry_point(\"tools\")\n", - " builder.add_conditional_edges(\"tools\", router)\n", - " self.app = builder.compile()\n", - "\n", - " # The query method will be used to send inputs to the agent\n", - " def query(self, input: str):\n", - " \"\"\"Query the application.\"\"\"\n", - " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", - " return chat_history\n", - " # return {'output': parse_messages_to_output_dictionary(chat_history)}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_HGcs6PVjRj_" - }, - "source": [ - "### Test the local agent\n", - "\n", - "Query your agent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1dXLLgBudu_L" - }, - "outputs": [], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "local_custom_agent.set_up()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PgkOhPmN3aCZ" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lGb58OJkjUs9" - }, - "outputs": [], - "source": [ - "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pSItXD5e4QD" - }, - "source": [ - "### Deploy the local agent to Vertex AI Reasoning Engine\n", - "\n", - "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", - "\n", - "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", - "\n", - "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3HLz_a1We4QE" - }, - "outputs": [], - "source": [ - "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", - "\n", - "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", - " local_custom_agent,\n", - " requirements=[\n", - " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", - " \"langchain_google_vertexai\",\n", - " \"langgraph\",\n", - " \"cloudpickle==3.0.0\",\n", - " \"pydantic==2.7.4\",\n", - " \"requests\",\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nu4RO1P9e4QE" - }, - "source": [ - "### Test the remote agent\n", - "\n", - "Query your remote agent." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sqBtzYyce4QE" - }, - "outputs": [], - "source": [ - "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", - "display(\n", - " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aOGPePsorpUl" - }, - "source": [ - "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", - "\n", - "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", - "\n", - "Monitoring focuses on how well your agent is performing specific tasks:\n", - "\n", - "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", - "\n", - "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", - "\n", - "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", - "\n", - "Observability is about understanding the overall health of the agent:\n", - "\n", - "* **Latency**: How long does it take the agent to respond?\n", - "\n", - "* **Failure Rate**: How often does the agent fail to produce a response?\n", - "\n", - "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e43229f3ad4f" - }, - "source": [ - "### Prepare Agent Evaluation dataset\n", - "\n", - "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", - "\n", - "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", - "\n", - "\n", - "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", - "\n", - "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fFf8uTdUiDt3" - }, - "outputs": [], - "source": [ - "eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - "}\n", - "\n", - "eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQEI1EcfvFHb" - }, - "source": [ - "Print some samples from the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EjsonqWWvIvE" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(eval_sample_dataset, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "htCrOS9fRVi8" - }, - "source": [ - "### Prepare an Agent function\n", - "\n", - "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GdO56MIDRZri" - }, - "outputs": [], - "source": [ - "def agent_parsed_response(input: str) -> dict:\n", - " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", - "\n", - " result = remote_custom_agent.query(input=input)\n", - "\n", - " # Parse function calls separately\n", - " agent_output = parse_messages_to_output_dictionary(result)\n", - "\n", - " return agent_output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m4CvBuf1afHG" - }, - "source": [ - "### Single tool usage evaluation\n", - "\n", - "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_rS5GGKHd5bx" - }, - "source": [ - "#### Set single tool usage metrics\n", - "\n", - "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", - "\n", - "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xixvq8dwd5by" - }, - "outputs": [], - "source": [ - "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ktKZoT2Qd5by" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QaMf9dqzySE6" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "SRv43fDcd5by" - }, - "outputs": [], - "source": [ - "single_tool_call_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=single_tool_usage_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", - " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", - ")\n", - "\n", - "display_eval_report(single_tool_call_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6o5BjSTFKVMS" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Use some helper functions to visualize a sample of evaluation result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZkpwPReipekr" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JlujdJpu5Kn6" - }, - "source": [ - "### Trajectory Evaluation\n", - "\n", - "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8s-nHdDJneHM" - }, - "source": [ - "#### Set trajectory metrics\n", - "\n", - "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", - "\n", - "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", - "\n", - "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", - "\n", - "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", - "\n", - "* `trajectory_precision`: proportion of predicted actions present in reference\n", - "\n", - "* `trajectory_recall`: proportion of reference actions present in predicted. \n", - "\n", - "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c32WIS95neHN" - }, - "outputs": [], - "source": [ - "trajectory_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"trajectory_any_order_match\",\n", - " \"trajectory_precision\",\n", - " \"trajectory_recall\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF3jhTH3neHN" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vOdS7TJUneHN" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", - "\n", - "trajectory_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(trajectory_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBiUI3LyLBtj" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "Print and visualize a sample of evaluation results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sLVRdN5llA0h" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PrxM5sMZYXHP" - }, - "outputs": [], - "source": [ - "plot_bar_plot(\n", - " trajectory_eval_result,\n", - " title=\"Trajectory Metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T8TipU2akHEd" - }, - "source": [ - "### Evaluate final response\n", - "\n", - "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DeK-py7ykkDN" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", - "\n", - "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cyGHGgeVklvz" - }, - "outputs": [], - "source": [ - "response_metrics = [\"safety\", \"coherence\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaBJWcg1kn55" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wRb2EC_hknSD" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", - "\n", - "response_eval_task = EvalTask(\n", - " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", - ")\n", - "\n", - "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", - "\n", - "display_eval_report(response_eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JtewTwiwg9qH" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cy0aRydrp9zW" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntRBK3Te6PEc" - }, - "source": [ - "### Evaluate generated response conditioned by tool choosing\n", - "\n", - "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", - "\n", - "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4bENwFcd6prX" - }, - "source": [ - "#### Define a custom metric\n", - "\n", - "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", - "\n", - "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "txGEHcg76riI" - }, - "outputs": [], - "source": [ - "criteria = {\n", - " \"Follows trajectory\": (\n", - " \"Evaluate whether the agent's response logically follows from the \"\n", - " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", - " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", - " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", - " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", - " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", - " )\n", - "}\n", - "\n", - "pointwise_rating_rubric = {\n", - " \"1\": \"Follows trajectory\",\n", - " \"0\": \"Does not follow trajectory\",\n", - "}\n", - "\n", - "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", - " criteria=criteria,\n", - " rating_rubric=pointwise_rating_rubric,\n", - " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8MJqXu0kikxd" - }, - "source": [ - "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5EL7iEDMikNQ" - }, - "outputs": [], - "source": [ - "print(response_follows_trajectory_prompt_template.prompt_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1djVp7Fi4Yy" - }, - "source": [ - "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nx1xbZD87iMj" - }, - "outputs": [], - "source": [ - "response_follows_trajectory_metric = PointwiseMetric(\n", - " metric=\"response_follows_trajectory\",\n", - " metric_prompt_template=response_follows_trajectory_prompt_template,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1pmxLwTe7Ywv" - }, - "source": [ - "#### Set response metrics\n", - "\n", - "Set new generated response evaluation metrics by including the custom metric.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wrsbVFDd7Ywv" - }, - "outputs": [], - "source": [ - "response_tool_metrics = [\n", - " \"trajectory_exact_match\",\n", - " \"trajectory_in_order_match\",\n", - " \"safety\",\n", - " response_follows_trajectory_metric,\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Lo-Sza807Ywv" - }, - "source": [ - "#### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_dkb4gSn7Ywv" - }, - "outputs": [], - "source": [ - "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", - "\n", - "response_eval_tool_task = EvalTask(\n", - " dataset=eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "response_eval_tool_result = response_eval_tool_task.evaluate(\n", - " runnable=agent_parsed_response\n", - ")\n", - "\n", - "display_eval_report(response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EFmnRBlWqJnC" - }, - "source": [ - "#### Visualize evaluation results\n", - "\n", - "\n", - "Print new evaluation result sample." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZODTRuq2lF75" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4nuUDP3a2eTB" - }, - "source": [ - "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", - "\n", - "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNh3-NDuZGDl" - }, - "source": [ - "### Bring your own evaluation dataset\n", - "\n", - "Define the evaluation dataset with the predicted trajectory and the generated response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y9hBgsg324Ej" - }, - "outputs": [], - "source": [ - "byod_eval_data = {\n", - " \"prompt\": [\n", - " \"Get price for smartphone\",\n", - " \"Get product details and price for headphones\",\n", - " \"Get details for usb charger\",\n", - " \"Get product details and price for shoes\",\n", - " \"Get product details for speaker?\",\n", - " ],\n", - " \"reference_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"predicted_trajectory\": [\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"smartphone\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " {\n", - " \"tool_name\": \"get_product_price\",\n", - " \"tool_input\": {\"product_name\": \"headphones\"},\n", - " },\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"usb charger\"},\n", - " }\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"shoes\"},\n", - " },\n", - " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", - " ],\n", - " [\n", - " {\n", - " \"tool_name\": \"get_product_details\",\n", - " \"tool_input\": {\"product_name\": \"speaker\"},\n", - " }\n", - " ],\n", - " ],\n", - " \"response\": [\n", - " 500,\n", - " 50,\n", - " \"A super fast and light usb charger\",\n", - " 100,\n", - " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", - " ],\n", - "}\n", - "\n", - "byod_eval_sample_dataset = pd.DataFrame(eval_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oEYmU2eJ7q-1" - }, - "source": [ - "### Run an evaluation task\n", - "\n", - "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wBD-4wpB7q-3" - }, - "outputs": [], - "source": [ - "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", - "\n", - "byod_response_eval_tool_task = EvalTask(\n", - " dataset=byod_eval_sample_dataset,\n", - " metrics=response_tool_metrics,\n", - " experiment=EXPERIMENT_NAME,\n", - ")\n", - "\n", - "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(experiment_run_name=EXPERIMENT_RUN_NAME)\n", - "\n", - "display_eval_report(byod_response_eval_tool_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9eU3LG6r7q-3" - }, - "source": [ - "### Visualize evaluation results\n", - "\n", - "Visualize evaluation result sample.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pQFzmd2I7q-3" - }, - "outputs": [], - "source": [ - "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0FEbvEOkZS8f" - }, - "outputs": [], - "source": [ - "display_radar_plot(\n", - " byod_response_eval_tool_result,\n", - " title=\"Agent evaluation metrics\",\n", - " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2a4e033321ad" - }, - "source": [ - "## Cleaning up\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ox2I3UfRlTOd" - }, - "outputs": [], - "source": [ - "delete_experiment = True\n", - "delete_remote_agent = True\n", - "\n", - "if delete_experiment:\n", - " try:\n", - " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", - " experiment.delete(delete_backing_tensorboard_runs=True)\n", - " except Exception as e:\n", - " print(e)\n", - "\n", - "if delete_remote_agent:\n", - " try:\n", - " remote_custom_agent.delete()\n", - " except Exception as e:\n", - " print(e)" - ] - } - ], - "metadata": { - "colab": { - "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "common-cpu.m126", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cpu:m126" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by 
applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Evaluate a LangGraph agent on Vertex AI Reasoning Engine (Customized template)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Authors | [Naveksha Sood](https://github.com/navekshasood) [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Just like any Generative AI application, AI agents require thorough evaluation to ensure they perform reliably and effectively. This evaluation should happen both in real-time (online) and on large datasets of test cases (offline). Developers building agent applications face a significant challenge in evaluating their performance. Both subjective (human feedback) and objective (measurable metrics) evaluations are essential for building trust in agent behavior.\n", + "\n", + "This tutorial shows how to evaluate a LangGraph agent with customized template on Vertex AI Reasoning Engine using Vertex AI Gen AI Evaluation.\n", + "\n", + "The tutorial uses the following Google Cloud services and resources:\n", + "\n", + "* Vertex AI Gen AI Evaluation\n", + "* Vertex AI Reasoning Engine\n", + "\n", + "The steps performed include:\n", + "\n", + "* Build and deploy an agent using LangGraph on Vertex AI Reasoning Engine\n", + "* Prepare Agent Evaluation dataset\n", + "* Single tool usage evaluation\n", + "* Trajectory evaluation\n", + "* Response evaluation\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Vertex AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user --quiet \"google-cloud-aiplatform[evaluation, langchain, reasoningengine]\" \\\n", + " \"langchain_google_vertexai\" \\\n", + " \"langgraph\" \\\n", + " \"cloudpickle==3.0.0\" \\\n", + " \"pydantic==2.7.4\" \\\n", + " \"requests\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. In Colab or Colab Enterprise, you might see an error message that says \"Your session crashed for an unknown reason.\" This is expected. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Vertex AI SDK\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "import vertexai\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "BUCKET_NAME = \"[your-bucket-name]\" # @param {type: \"string\", placeholder: \"[your-bucket-name]\", isTemplate: true}\n", + "\n", + "if not BUCKET_NAME or BUCKET_NAME == \"[your-bucket-name]\":\n", + " BUCKET_NAME = f\"{PROJECT_ID}-bucket\"\n", + "\n", + "! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI\n", + "\n", + "EXPERIMENT_NAME = \"evaluate-re-agent\" # @param {type:\"string\"}\n", + "\n", + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=LOCATION,\n", + " staging_bucket=BUCKET_URI,\n", + " experiment=EXPERIMENT_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "## Import libraries\n", + "\n", + "Import tutorial libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "# General\n", + "import random\n", + "import string\n", + "from typing import Literal\n", + "\n", + "from IPython.display import HTML, Markdown, display\n", + "\n", + "# Build agent\n", + "from crewai_tools import tool\n", + "\n", + "# Evaluate agent\n", + "from google.cloud import aiplatform\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "from vertexai.preview import reasoning_engines\n", + "from vertexai.preview.evaluation import EvalTask\n", + "from vertexai.preview.evaluation.metrics import (\n", + " PointwiseMetric,\n", + " PointwiseMetricPromptTemplate,\n", + " TrajectorySingleToolUse,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MVnBDX54gz7j" + }, + "source": [ + "## Define helper functions\n", + "\n", + "Initiate a set of helper functions to print tutorial results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSgWjMD_g1_v" + }, + "outputs": [], + "source": [ + "def get_id(length: int = 8) -> str:\n", + " \"\"\"Generate a uuid of a specified length (default=8).\"\"\"\n", + " return \"\".join(random.choices(string.ascii_lowercase + string.digits, k=length))\n", + "\n", + "\n", + "def parse_crewai_output_to_dictionary(crew, crew_output):\n", + " \"\"\"\n", + " Parse CrewAI output into a structured dictionary format.\n", + " \"\"\"\n", + " final_output = {\"response\": str(crew_output), \"predicted_trajectory\": []}\n", + "\n", + " try:\n", + " # Access tools_results directly from each agent\n", + " for agent in crew.agents:\n", + " if hasattr(agent, \"tools_results\"):\n", + " for tool_result in agent.tools_results:\n", + " tool_info = {\n", + " \"tool_name\": tool_result.get(\"tool_name\", \"\"),\n", + " \"tool_input\": tool_result.get(\"tool_args\", {}),\n", + " }\n", + " final_output[\"predicted_trajectory\"].append(tool_info)\n", + "\n", + " except Exception as e:\n", + " final_output[\"error\"] = f\"Error parsing tools results: {str(e)}\"\n", + "\n", + " return final_output\n", + "\n", + "\n", + "def format_output_as_markdown(output: dict) -> str:\n", + " \"\"\"Convert the output dictionary to a formatted markdown string.\"\"\"\n", + " markdown = \"### AI Response\\n\"\n", + " markdown += f\"{output['response']}\\n\\n\"\n", + "\n", + " if output[\"predicted_trajectory\"]:\n", + " markdown += \"### Function Calls\\n\"\n", + " for call in output[\"predicted_trajectory\"]:\n", + " markdown += f\"- **Function**: `{call['tool_name']}`\\n\"\n", + " markdown += \" - **Arguments**:\\n\"\n", + " for key, value in call[\"tool_input\"].items():\n", + " markdown += f\" - `{key}`: `{value}`\\n\"\n", + "\n", + " return markdown\n", + "\n", + "\n", + "def display_eval_report(eval_result: pd.DataFrame) -> None:\n", + " \"\"\"Display the evaluation results.\"\"\"\n", + " metrics_df = pd.DataFrame.from_dict(eval_result.summary_metrics, orient=\"index\").T\n", + " display(Markdown(\"### Summary Metrics\"))\n", + " display(metrics_df)\n", + "\n", + " display(Markdown(f\"### Row-wise Metrics\"))\n", + " display(eval_result.metrics_table)\n", + "\n", + "\n", + "def display_drilldown(row: pd.Series) -> None:\n", + " \"\"\"Displays a drill-down view for trajectory data within a row.\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + "\n", + " if not (\n", + " isinstance(row[\"predicted_trajectory\"], list)\n", + " and isinstance(row[\"reference_trajectory\"], list)\n", + " ):\n", + " return\n", + "\n", + " for predicted_trajectory, reference_trajectory in zip(\n", + " row[\"predicted_trajectory\"], row[\"reference_trajectory\"]\n", + " ):\n", + " display(\n", + " HTML(\n", + " f\"

Tool Names:

{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}
\"\n", + " )\n", + " )\n", + "\n", + " if not (\n", + " isinstance(predicted_trajectory.get(\"tool_input\"), dict)\n", + " and isinstance(reference_trajectory.get(\"tool_input\"), dict)\n", + " ):\n", + " continue\n", + "\n", + " for tool_input_key in predicted_trajectory[\"tool_input\"]:\n", + " print(\"Tool Input Key: \", tool_input_key)\n", + "\n", + " if tool_input_key in reference_trajectory[\"tool_input\"]:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " reference_trajectory[\"tool_input\"][tool_input_key],\n", + " )\n", + " else:\n", + " print(\n", + " \"Tool Values: \",\n", + " predicted_trajectory[\"tool_input\"][tool_input_key],\n", + " \"N/A\",\n", + " )\n", + " print(\"\\n\")\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", + "def display_dataframe_rows(\n", + " df: pd.DataFrame,\n", + " columns: list[str] | None = None,\n", + " num_rows: int = 3,\n", + " display_drilldown: bool = False,\n", + ") -> None:\n", + " \"\"\"Displays a subset of rows from a DataFrame, optionally including a drill-down view.\"\"\"\n", + "\n", + " if columns:\n", + " df = df[columns]\n", + "\n", + " base_style = \"font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;\"\n", + " header_style = base_style + \"font-weight: bold;\"\n", + "\n", + " for _, row in df.head(num_rows).iterrows():\n", + " for column in df.columns:\n", + " display(\n", + " HTML(\n", + " f\"{column.replace('_', ' ').title()}: \"\n", + " )\n", + " )\n", + " display(HTML(f\"{row[column]}
\"))\n", + "\n", + " display(HTML(\"
\"))\n", + "\n", + " if (\n", + " display_drilldown\n", + " and \"predicted_trajectory\" in df.columns\n", + " and \"reference_trajectory\" in df.columns\n", + " ):\n", + " display_drilldown(row)\n", + "\n", + "\n", + "def plot_bar_plot(\n", + " eval_result: pd.DataFrame, title: str, metrics: list[str] = None\n", + ") -> None:\n", + " fig = go.Figure()\n", + " data = []\n", + "\n", + " summary_metrics = eval_result.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " data.append(\n", + " go.Bar(\n", + " x=list(summary_metrics.keys()),\n", + " y=list(summary_metrics.values()),\n", + " name=title,\n", + " )\n", + " )\n", + "\n", + " fig = go.Figure(data=data)\n", + "\n", + " # Change the bar mode\n", + " fig.update_layout(barmode=\"group\")\n", + " fig.show()\n", + "\n", + "\n", + "def display_radar_plot(eval_results, title: str, metrics=None):\n", + " \"\"\"Plot the radar plot.\"\"\"\n", + " fig = go.Figure()\n", + " summary_metrics = eval_results.summary_metrics\n", + " if metrics:\n", + " summary_metrics = {\n", + " k: summary_metrics[k]\n", + " for k, v in summary_metrics.items()\n", + " if any(selected_metric in k for selected_metric in metrics)\n", + " }\n", + "\n", + " min_val = min(summary_metrics.values())\n", + " max_val = max(summary_metrics.values())\n", + "\n", + " fig.add_trace(\n", + " go.Scatterpolar(\n", + " r=list(summary_metrics.values()),\n", + " theta=list(summary_metrics.keys()),\n", + " fill=\"toself\",\n", + " name=title,\n", + " )\n", + " )\n", + " fig.update_layout(\n", + " title=title,\n", + " polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),\n", + " showlegend=True,\n", + " )\n", + " fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDaa2Mtsifmq" + }, + "source": [ + "## Build an agent using Vertex AI Reasoning Engine's customized template\n", + "\n", + "Build and deploy your application using LangGraph, including the Gemini model and custom tools that you define.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHwShhpOitKp" + }, + "source": [ + "### Set tools\n", + "\n", + "To start, set the tools that a customer support agent needs to do their job." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gA2ZKvfeislw" + }, + "outputs": [], + "source": [ + "@tool\n", + "def get_product_details(product_name: str):\n", + " \"\"\"Gathers basic details about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": \"A cutting-edge smartphone with advanced camera features and lightning-fast processing.\",\n", + " \"usb charger\": \"A super fast and light usb charger\",\n", + " \"shoes\": \"High-performance running shoes designed for comfort, support, and speed.\",\n", + " \"headphones\": \"Wireless headphones with advanced noise cancellation technology for immersive audio.\",\n", + " \"speaker\": \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.\",\n", + " }\n", + " return details.get(product_name, \"Product details not found.\")\n", + "\n", + "\n", + "@tool\n", + "def get_product_price(product_name: str):\n", + " \"\"\"Gathers price about a product.\"\"\"\n", + " details = {\n", + " \"smartphone\": 500,\n", + " \"usb charger\": 10,\n", + " \"shoes\": 100,\n", + " \"headphones\": 50,\n", + " \"speaker\": 80,\n", + " }\n", + " return details.get(product_name, \"Product price not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be70714d9fae" + }, + "source": [ + "### Define router\n", + "\n", + "Set up a router to direct conversation flow by selecting the appropriate tool based on user input or interaction state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "516b5108d327" + }, + "outputs": [], + "source": [ + "def router(\n", + " state: list[BaseMessage],\n", + ") -> Literal[\"get_product_details\", \"get_product_price\", \"__end__\"]:\n", + " \"\"\"Initiates product details or price retrieval if the user asks for a product.\"\"\"\n", + " # Get the tool_calls from the last message in the conversation history.\n", + " tool_calls = state[-1].tool_calls\n", + "\n", + " # If there are any tool_calls\n", + " if tool_calls:\n", + " # Check the function name in the first tool call\n", + " function_name = tool_calls[0].get(\"name\")\n", + " if function_name == \"get_product_price\":\n", + " return \"get_product_price\"\n", + " else:\n", + " return \"get_product_details\"\n", + " else:\n", + " # End the conversation flow.\n", + " return \"__end__\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FHjhBVx2cHWb" + }, + "source": [ + "### Set the model\n", + "\n", + "Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCx9hbpccHWc" + }, + "outputs": [], + "source": [ + "model = \"gemini-1.5-pro\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNlAY9cojEWz" + }, + "source": [ + "### Assemble the agent\n", + "\n", + "To create a LangGraph agent using [Vertex AI Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy), use the [customized template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize).\n", + "\n", + "This class helps you quickly get an agent using any framework running Vertex AI Reasoning Engine.\n", + "\n", + "To learn more about the template, check out [Customize an application template](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/customize) documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dAFdi7SujGP8" + }, + "outputs": [], + "source": [ + "class LangGraphApp:\n", + " def __init__(self, project: str, location: str, model: str = model) -> None:\n", + " self.project_id = project\n", + " self.location = location\n", + " self.model = model\n", + "\n", + " # The set_up method is used to define application initialization logic\n", + " def set_up(self) -> None:\n", + " model = ChatVertexAI(model=self.model)\n", + " builder = MessageGraph()\n", + "\n", + " model_with_tools = model.bind_tools([get_product_details, get_product_price])\n", + " builder.add_node(\"tools\", model_with_tools)\n", + "\n", + " tool_node = ToolNode([get_product_details, get_product_price])\n", + " builder.add_node(\"get_product_details\", tool_node)\n", + " builder.add_node(\"get_product_price\", tool_node)\n", + " builder.add_edge(\"get_product_details\", END)\n", + " builder.add_edge(\"get_product_price\", END)\n", + "\n", + " builder.set_entry_point(\"tools\")\n", + " builder.add_conditional_edges(\"tools\", router)\n", + " self.app = builder.compile()\n", + "\n", + " # The query method will be used to send inputs to the agent\n", + " def query(self, input: str):\n", + " \"\"\"Query the application.\"\"\"\n", + " chat_history = langchain_load_dump.dumpd(self.app.invoke(HumanMessage(input)))\n", + " return chat_history\n", + " # return {'output': parse_messages_to_output_dictionary(chat_history)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_HGcs6PVjRj_" + }, + "source": [ + "### Test the local agent\n", + "\n", + "Query your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1dXLLgBudu_L" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "local_custom_agent.set_up()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PgkOhPmN3aCZ" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGb58OJkjUs9" + }, + "outputs": [], + "source": [ + "response = local_custom_agent.query(input=\"Get product price for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pSItXD5e4QD" + }, + "source": [ + "### Deploy the local agent to Vertex AI Reasoning Engine\n", + "\n", + "To deploy the local agent on Vertex AI Reasoning Engine, you can use the `create` method by passing the agent and some specify dependencies (`requirements` for external PyPI packages and `extra_packages` for local packages ).\n", + "\n", + "Look at [Deploy the application](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/deploy#create_a_reasoningengine_instance) documentation page to learn more. \n", + "\n", + "> The agent deployment on Vertex AI Reasoning Engine would require ~ 10 mins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HLz_a1We4QE" + }, + "outputs": [], + "source": [ + "local_custom_agent = LangGraphApp(project=PROJECT_ID, location=LOCATION)\n", + "\n", + "remote_custom_agent = reasoning_engines.ReasoningEngine.create(\n", + " local_custom_agent,\n", + " requirements=[\n", + " \"google-cloud-aiplatform[langchain,reasoningengine]\",\n", + " \"langchain_google_vertexai\",\n", + " \"langgraph\",\n", + " \"cloudpickle==3.0.0\",\n", + " \"pydantic==2.7.4\",\n", + " \"requests\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nu4RO1P9e4QE" + }, + "source": [ + "### Test the remote agent\n", + "\n", + "Query your remote agent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sqBtzYyce4QE" + }, + "outputs": [], + "source": [ + "response = remote_custom_agent.query(input=\"Get product details for shoes\")\n", + "display(\n", + " Markdown(format_output_as_markdown(parse_messages_to_output_dictionary(response)))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aOGPePsorpUl" + }, + "source": [ + "## Evaluating a custom agent with Vertex AI Gen AI Evaluation\n", + "\n", + "When working with AI agents, it's important to keep track of their performance and how well they're working. 
You can look at this in two main ways: **monitoring** and **observability**.\n", + "\n", + "Monitoring focuses on how well your agent is performing specific tasks:\n", + "\n", + "* **Single Tool Selection**: Is the agent choosing the right tools for the job?\n", + "\n", + "* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?\n", + "\n", + "* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?\n", + "\n", + "Observability is about understanding the overall health of the agent:\n", + "\n", + "* **Latency**: How long does it take the agent to respond?\n", + "\n", + "* **Failure Rate**: How often does the agent fail to produce a response?\n", + "\n", + "Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e43229f3ad4f" + }, + "source": [ + "### Prepare Agent Evaluation dataset\n", + "\n", + "To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent. \n", + "\n", + "This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.\n", + "\n", + "\n", + "> Optionally, you can provide both generated responses and predicted trajectory (**bring-your-own-dataset scenario**).\n", + "\n", + "Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fFf8uTdUiDt3" + }, + "outputs": [], + "source": [ + "eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + "}\n", + "\n", + "eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PQEI1EcfvFHb" + }, + "source": [ + "Print some samples from the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EjsonqWWvIvE" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(eval_sample_dataset, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "htCrOS9fRVi8" + }, + "source": [ + "### Prepare an Agent function\n", + "\n", + "In this scenario with a custom agent, you need an agent function to parse the agent output and pass it to Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GdO56MIDRZri" + }, + "outputs": [], + "source": [ + "def agent_parsed_response(input: str) -> dict:\n", + " \"\"\"Parse the agent output and pass it to Vertex AI Gen AI Evaluation.\"\"\"\n", + "\n", + " result = remote_custom_agent.query(input=input)\n", + "\n", + " # Parse function calls separately\n", + " agent_output = parse_messages_to_output_dictionary(result)\n", + "\n", + " return agent_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m4CvBuf1afHG" + }, + "source": [ + "### Single tool usage evaluation\n", + "\n", + "After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_rS5GGKHd5bx" + }, + "source": [ + "#### Set single tool usage metrics\n", + "\n", + "The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.\n", + "\n", + "To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. 
For example, if a user asks to \"send an email\", you might expect the agent to use an \"send_email\" tool, and you'd specify that tool's name when using this metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xixvq8dwd5by" + }, + "outputs": [], + "source": [ + "single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name=\"get_product_price\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ktKZoT2Qd5by" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using a local agent (local_1p_agent) and assigns a unique identifier to this specific evaluation run, storing the evaluation results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QaMf9dqzySE6" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN = f\"single-metric-eval-{get_id()}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SRv43fDcd5by" + }, + "outputs": [], + "source": [ + "single_tool_call_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=single_tool_usage_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "single_tool_call_eval_result = single_tool_call_eval_task.evaluate(\n", + " runnable=agent_parsed_response, experiment_run_name=EXPERIMENT_RUN\n", + ")\n", + "\n", + "display_eval_report(single_tool_call_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6o5BjSTFKVMS" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Use some helper functions to visualize a sample of evaluation result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZkpwPReipekr" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JlujdJpu5Kn6" + }, + "source": [ + "### Trajectory Evaluation\n", + "\n", + "After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8s-nHdDJneHM" + }, + "source": [ + "#### Set trajectory metrics\n", + "\n", + "To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:\n", + "\n", + "* `trajectory_exact_match`: identical trajectories (same actions, same order)\n", + "\n", + "* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)\n", + "\n", + "* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).\n", + "\n", + "* `trajectory_precision`: proportion of predicted actions present in reference\n", + "\n", + "* `trajectory_recall`: proportion of reference actions present in predicted. \n", + "\n", + "All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1." 
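[Editor's note] For intuition, here is a toy, tool-name-only illustration of how these five metrics differ. This is an editorial example: the real metrics compare full tool calls (names and arguments), and `check_inventory` is an invented tool name used only to create a mismatch.

```python
# Reference asks for details then price; the prediction adds an extra, unexpected call.
reference = ["get_product_details", "get_product_price"]
predicted = ["get_product_details", "check_inventory", "get_product_price"]

exact_match = float(predicted == reference)                           # 0.0 (not identical)
any_order_match = float(all(t in predicted for t in reference))       # 1.0 (all reference calls present)
it = iter(predicted)
in_order_match = float(all(t in it for t in reference))               # 1.0 (reference is a subsequence)
precision = sum(t in reference for t in predicted) / len(predicted)   # ~0.67 (2 of 3 predicted calls expected)
recall = sum(t in predicted for t in reference) / len(reference)      # 1.0 (both expected calls made)
```

The cell that follows simply passes the corresponding metric names to `EvalTask`; no manual computation is needed.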
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c32WIS95neHN" + }, + "outputs": [], + "source": [ + "trajectory_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"trajectory_any_order_match\",\n", + " \"trajectory_precision\",\n", + " \"trajectory_recall\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF3jhTH3neHN" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vOdS7TJUneHN" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-trajectory\" # @param {type:\"string\"}\n", + "\n", + "trajectory_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=trajectory_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "trajectory_eval_result = trajectory_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(trajectory_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DBiUI3LyLBtj" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "Print and visualize a sample of evaluation results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sLVRdN5llA0h" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PrxM5sMZYXHP" + }, + "outputs": [], + "source": [ + "plot_bar_plot(\n", + " trajectory_eval_result,\n", + " title=\"Trajectory Metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in trajectory_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T8TipU2akHEd" + }, + "source": [ + "### Evaluate final response\n", + "\n", + "Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DeK-py7ykkDN" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.\n", + "\n", + "Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyGHGgeVklvz" + }, + "outputs": [], + "source": [ + "response_metrics = [\"safety\", \"coherence\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DaBJWcg1kn55" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRb2EC_hknSD" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response\" # @param {type:\"string\"}\n", + "\n", + "response_eval_task = EvalTask(\n", + " dataset=eval_sample_dataset, metrics=response_metrics, experiment=EXPERIMENT_NAME\n", + ")\n", + "\n", + "response_eval_result = response_eval_task.evaluate(runnable=agent_parsed_response)\n", + "\n", + "display_eval_report(response_eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JtewTwiwg9qH" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cy0aRydrp9zW" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntRBK3Te6PEc" + }, + "source": [ + "### Evaluate generated response conditioned by tool choosing\n", + "\n", + "When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.\n", + "\n", + "Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4bENwFcd6prX" + }, + "source": [ + "#### Define a custom metric\n", + "\n", + "According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.\n", + "\n", + "Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "txGEHcg76riI" + }, + "outputs": [], + "source": [ + "criteria = {\n", + " \"Follows trajectory\": (\n", + " \"Evaluate whether the agent's response logically follows from the \"\n", + " \"sequence of actions it took. 
Consider these sub-points:\\n\"\n", + " \" - Does the response reflect the information gathered during the trajectory?\\n\"\n", + " \" - Is the response consistent with the goals and constraints of the task?\\n\"\n", + " \" - Are there any unexpected or illogical jumps in reasoning?\\n\"\n", + " \"Provide specific examples from the trajectory and response to support your evaluation.\"\n", + " )\n", + "}\n", + "\n", + "pointwise_rating_rubric = {\n", + " \"1\": \"Follows trajectory\",\n", + " \"0\": \"Does not follow trajectory\",\n", + "}\n", + "\n", + "response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(\n", + " criteria=criteria,\n", + " rating_rubric=pointwise_rating_rubric,\n", + " input_variables=[\"prompt\", \"predicted_trajectory\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8MJqXu0kikxd" + }, + "source": [ + "Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5EL7iEDMikNQ" + }, + "outputs": [], + "source": [ + "print(response_follows_trajectory_prompt_template.prompt_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1djVp7Fi4Yy" + }, + "source": [ + "After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nx1xbZD87iMj" + }, + "outputs": [], + "source": [ + "response_follows_trajectory_metric = PointwiseMetric(\n", + " metric=\"response_follows_trajectory\",\n", + " metric_prompt_template=response_follows_trajectory_prompt_template,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1pmxLwTe7Ywv" + }, + "source": [ + "#### Set response metrics\n", + "\n", + "Set new generated response evaluation metrics by including the custom metric.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wrsbVFDd7Ywv" + }, + "outputs": [], + "source": [ + "response_tool_metrics = [\n", + " \"trajectory_exact_match\",\n", + " \"trajectory_in_order_match\",\n", + " \"safety\",\n", + " response_follows_trajectory_metric,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lo-Sza807Ywv" + }, + "source": [ + "#### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dkb4gSn7Ywv" + }, + "outputs": [], + "source": [ + "EXPERIMENT_NAME = \"evaluate-re-agent-response-by-tools\" # @param {type:\"string\"}\n", + "\n", + "response_eval_tool_task = EvalTask(\n", + " dataset=eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "response_eval_tool_result = response_eval_tool_task.evaluate(\n", + " runnable=agent_parsed_response\n", + ")\n", + "\n", + "display_eval_report(response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EFmnRBlWqJnC" + }, + "source": [ + "#### Visualize evaluation results\n", + "\n", + "\n", + "Print new evaluation result sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZODTRuq2lF75" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nuUDP3a2eTB" + }, + "source": [ + "## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation\n", + "\n", + "In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pNh3-NDuZGDl" + }, + "source": [ + "### Bring your own evaluation dataset\n", + "\n", + "Define the evaluation dataset with the predicted trajectory and the generated response." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y9hBgsg324Ej" + }, + "outputs": [], + "source": [ + "byod_eval_data = {\n", + " \"prompt\": [\n", + " \"Get price for smartphone\",\n", + " \"Get product details and price for headphones\",\n", + " \"Get details for usb charger\",\n", + " \"Get product details and price for shoes\",\n", + " \"Get product details for speaker?\",\n", + " ],\n", + " \"reference_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"predicted_trajectory\": [\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"smartphone\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " {\n", + " \"tool_name\": \"get_product_price\",\n", + " \"tool_input\": {\"product_name\": \"headphones\"},\n", + " },\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"usb charger\"},\n", + " }\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"shoes\"},\n", + " },\n", + " {\"tool_name\": \"get_product_price\", \"tool_input\": {\"product_name\": \"shoes\"}},\n", + " ],\n", + " [\n", + " {\n", + " \"tool_name\": \"get_product_details\",\n", + " \"tool_input\": {\"product_name\": \"speaker\"},\n", + " }\n", + " ],\n", + " ],\n", + " \"response\": [\n", + " 500,\n", + " 50,\n", + " \"A super fast and light usb charger\",\n", + " 100,\n", + " \"A voice-controlled smart speaker that plays music, sets alarms, and controls smart home 
devices.\",\n", + " ],\n", + "}\n", + "\n", + "byod_eval_sample_dataset = pd.DataFrame(eval_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEYmU2eJ7q-1" + }, + "source": [ + "### Run an evaluation task\n", + "\n", + "Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wBD-4wpB7q-3" + }, + "outputs": [], + "source": [ + "EXPERIMENT_RUN_NAME = f\"response-over-tools-byod-{get_id()}\"\n", + "\n", + "byod_response_eval_tool_task = EvalTask(\n", + " dataset=byod_eval_sample_dataset,\n", + " metrics=response_tool_metrics,\n", + " experiment=EXPERIMENT_NAME,\n", + ")\n", + "\n", + "byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(\n", + " experiment_run_name=EXPERIMENT_RUN_NAME\n", + ")\n", + "\n", + "display_eval_report(byod_response_eval_tool_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9eU3LG6r7q-3" + }, + "source": [ + "### Visualize evaluation results\n", + "\n", + "Visualize evaluation result sample.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pQFzmd2I7q-3" + }, + "outputs": [], + "source": [ + "display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0FEbvEOkZS8f" + }, + "outputs": [], + "source": [ + "display_radar_plot(\n", + " byod_response_eval_tool_result,\n", + " title=\"Agent evaluation metrics\",\n", + " metrics=[f\"{metric}/mean\" for metric in response_tool_metrics],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2a4e033321ad" + }, + "source": [ + "## Cleaning up\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ox2I3UfRlTOd" + }, + "outputs": [], + "source": [ + "delete_experiment = True\n", + "delete_remote_agent = True\n", + "\n", + "if delete_experiment:\n", + " try:\n", + " experiment = aiplatform.Experiment(EXPERIMENT_NAME)\n", + " experiment.delete(delete_backing_tensorboard_runs=True)\n", + " except Exception as e:\n", + " print(e)\n", + "\n", + "if delete_remote_agent:\n", + " try:\n", + " remote_custom_agent.delete()\n", + " except Exception as e:\n", + " print(e)" + ] + } + ], + "metadata": { + "colab": { + "name": "evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } From 487a48e65ad2477675089ca597a3ac4004047de7 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Wed, 18 Dec 2024 10:40:04 -0600 Subject: [PATCH 13/14] Fix spelling issue `runnning` --- .github/actions/spelling/allow.txt | 1 - gemini/evaluation/evaluating_langgraph_agent.ipynb | 2 +- ...ting_crewai_agent_reasoning_engine_customized_template.ipynb | 2 +- ...ing_langchain_agent_reasoning_engine_prebuilt_template.ipynb | 2 +- ...g_langgraph_agent_reasoning_engine_customized_template.ipynb | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index d3ba8ce728..29e1122872 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -1059,7 +1059,6 @@ rrf rsc rsp runjdwp -runnning saaagesh saveddir scann diff --git a/gemini/evaluation/evaluating_langgraph_agent.ipynb 
b/gemini/evaluation/evaluating_langgraph_agent.ipynb index 227f09a035..7b7aafe841 100644 --- a/gemini/evaluation/evaluating_langgraph_agent.ipynb +++ b/gemini/evaluation/evaluating_langgraph_agent.ipynb @@ -989,7 +989,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." ] }, { diff --git a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb index 292e5f25cc..94edbc96e9 100644 --- a/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_crewai_agent_reasoning_engine_customized_template.ipynb @@ -1118,7 +1118,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." ] }, { diff --git a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb index f129ee14c3..bd48074f2a 100644 --- a/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langchain_agent_reasoning_engine_prebuilt_template.ipynb @@ -955,7 +955,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." ] }, { diff --git a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb index b7f65a5705..5462940dd3 100644 --- a/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb +++ b/gemini/reasoning-engine/evaluating_langgraph_agent_reasoning_engine_customized_template.ipynb @@ -1115,7 +1115,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." ] }, { From 4ac964a83ca1633256b197035b801b5c71568fad Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Wed, 18 Dec 2024 10:42:28 -0600 Subject: [PATCH 14/14] Fix spelling --- gemini/evaluation/evaluating_crewai_agent.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gemini/evaluation/evaluating_crewai_agent.ipynb b/gemini/evaluation/evaluating_crewai_agent.ipynb index 471f95ebea..aa3395af1e 100644 --- a/gemini/evaluation/evaluating_crewai_agent.ipynb +++ b/gemini/evaluation/evaluating_crewai_agent.ipynb @@ -984,7 +984,7 @@ "source": [ "#### Run an evaluation task\n", "\n", - "Submit an evaluation by runnning `evaluate` method of the new `EvalTask`." + "Submit an evaluation by running `evaluate` method of the new `EvalTask`." ] }, {