Skip to content

Commit

Permalink
feat: add support for Comet and MetricX as target metrics in the note…
Browse files Browse the repository at this point in the history
…books (#1531)

# Description
1. add support for Comet and MetricX as target metrics in the notebooks
2. fix the RESPONSE_MIME_TYPE format error in the UI notebook
3. fix the eval_metrics_weight error in the UI notebook
4. use "text/plain" as the default value for MIME_TYPE
5. update the supported metrics list in all notebooks

Thank you for opening a Pull Request!
Before submitting your PR, there are a few things you can do to make
sure it goes smoothly:

- [x] Follow the [`CONTRIBUTING`
Guide](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/CONTRIBUTING.md).
- [x] You are listed as the author in your notebook or README file.
- [x] Your account is listed in
[`CODEOWNERS`](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/.github/CODEOWNERS)
for the file(s).
- [x] Write your Pull Request title following the
<https://www.conventionalcommits.org/> specification.
- [x] Ensure the tests and linter pass (Run `nox -s format` from the
repository root to format).
- [x] Appropriate docs were updated (if necessary)
  • Loading branch information
want-to-be-relaxed authored Dec 13, 2024
1 parent 9b4518d commit 4712889
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 17 deletions.
1 change: 1 addition & 0 deletions .github/actions/spelling/allow.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1210,3 +1210,4 @@ ytd
yticks
zakarid
zaxis
metricx
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" \"groundedness\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.9,\n",
" 0.1,\n",
Expand All @@ -935,13 +935,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_mime_type=\"text/plain\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=\"\", # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" \"{}\"\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1153,7 +1153,7 @@
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" \"custom_metric\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.8,\n",
" 0.2,\n",
Expand All @@ -1178,13 +1178,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_mime_type=\"text/plain\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=\"\", # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" \"{}\"\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@
" optimization_mode=\"instruction\", # Optimization mode. String. Supported modes: \"instruction\", \"demonstration\", \"instruction_and_demo\"\n",
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 1.0,\n",
" ], # Weights for evaluation metrics. List of floats. Length must match eval_metrics_types. Should sum to 1.\n",
Expand All @@ -839,13 +839,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, application/json.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=RESPONSE_SCHEMA, # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" PLACEHOLDER_TO_CONTENT\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@
" \"tool_name_match\",\n",
" \"tool_parameter_key_match\",\n",
" \"tool_parameter_kv_match\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.4,\n",
" 0.3,\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@
"* Target Model: Which model you are trying to optimize your prompts to.\n",
"* Optimization Mode: The mode in which you are trying to optimize your prompt with.\n",
"* Evaluation Metrics: The evaluation metrics in which you are trying to optimize your prompts against.\n",
"* Translation Source Field Name: fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
"\n",
"Refer [here](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-optimizer#configuration) to learn more about the different configuration settings and how to best utilize them."
]
},
Expand All @@ -244,7 +246,8 @@
"SOURCE_MODEL = \"\" # @param [\"\", \"gemini-1.0-pro-001\", \"gemini-1.0-pro-002\", \"gemini-1.5-flash-001\", \"gemini-1.5-flash-002\", \"gemini-1.5-pro-001\", \"gemini-1.5-pro-002\", \"gemini-1.0-ultra-001\", \"gemini-experimental\", \"gemini-flash-experimental\", \"gemini-pro-experimental\", \"text-bison@001\", \"text-bison@002\", \"text-bison32k@002\", \"text-unicorn@001\"]\n",
"TARGET_MODEL = \"gemini-1.5-flash-001\" # @param [\"gemini-1.0-pro-001\", \"gemini-1.0-pro-002\", \"gemini-1.5-flash-001\", \"gemini-1.5-flash-002\", \"gemini-1.5-pro-001\", \"gemini-1.5-pro-002\", \"gemini-1.0-ultra-001\", \"gemini-experimental\", \"gemini-flash-experimental\", \"gemini-pro-experimental\"]\n",
"OPTIMIZATION_MODE = \"instruction_and_demo\" # @param [\"instruction\", \"demonstration\", \"instruction_and_demo\"]\n",
"EVAL_METRIC = \"question_answering_correctness\" # @param [\"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}"
"EVAL_METRIC = \"question_answering_correctness\" # @param [\"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"TRANSLATION_SOURCE_FIELD_NAME = \"\" # @param {type:\"string\"}"
]
},
{
Expand Down Expand Up @@ -280,19 +283,17 @@
"\n",
"# @markdown **Multi-metric Configs**: <br/>\n",
"# @markdown Use this section only if you need more than one metric for optimization. This will override the metric you picked above.\n",
"EVAL_METRIC_1 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_1 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_1_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"EVAL_METRIC_2 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_2 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_2_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"EVAL_METRIC_3 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_3 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_3_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"METRIC_AGGREGATION_TYPE = \"weighted_sum\" # @param [\"weighted_sum\", \"weighted_average\"]\n",
"\n",
"# @markdown **Misc Configs**: <br/>\n",
"PLACEHOLDER_TO_VALUE = \"{}\" # @param\n",
"RESPONSE_MIME_TYPE = (\n",
" \"text/plain\" # @param [\"text/plain\", \"application/json\", \"text/x.enum\"]\n",
")\n",
"RESPONSE_MIME_TYPE = \"text/plain\" # @param [\"text/plain\", \"application/json\", \"text/x.enum\"]\n",
"RESPONSE_SCHEMA = \"\"\n",
"TARGET_LANGUAGE = \"English\" # @param [\"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"]\n",
"TOOLS = \"\" # @param\n",
Expand Down Expand Up @@ -370,6 +371,7 @@
" \"placeholder_to_content\": json.loads(PLACEHOLDER_TO_VALUE),\n",
" \"tools\": TOOLS,\n",
" \"tool_config\": TOOL_CONFIG,\n",
" \"translation_source_field_name\": TRANSLATION_SOURCE_FIELD_NAME,\n",
"}\n",
"\n",
"if EVAL_METRIC_1 == \"NA\":\n",
Expand All @@ -378,11 +380,14 @@
"else:\n",
" metrics = []\n",
" weights = []\n",
" for metric in [EVAL_METRIC_1, EVAL_METRIC_2, EVAL_METRIC_3]:\n",
" for metric, weight in zip(\n",
" [EVAL_METRIC_1, EVAL_METRIC_2, EVAL_METRIC_3],\n",
" [EVAL_METRIC_1_WEIGHT, EVAL_METRIC_2_WEIGHT, EVAL_METRIC_3_WEIGHT],\n",
" ):\n",
" if metric == \"NA\":\n",
" break\n",
" metrics.append(metric)\n",
" weights.append(EVAL_METRIC_1_WEIGHT)\n",
" weights.append(weight)\n",
" params[\"eval_metrics_types\"] = metrics\n",
" params[\"eval_metrics_weights\"] = weights\n",
"\n",
Expand Down

0 comments on commit 4712889

Please sign in to comment.