Skip to content

Commit

Permalink
feat: add support for Comet and MetricX as target metrics in the note…
Browse files Browse the repository at this point in the history
…books (#1531)

# Description
1. add support for Comet and MetricX as target metrics in the notebooks
2. fix the RESPONSE_MIME_TYPE format error in the UI notebook
3. fix the eval_metrics_weight error in the UI notebook
4. use "text/plain" as the default value for MIME_TYPE
5. update the supported metrics list in all notebooks

Thank you for opening a Pull Request!
Before submitting your PR, there are a few things you can do to make
sure it goes smoothly:

- [x] Follow the [`CONTRIBUTING`
Guide](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/CONTRIBUTING.md).
- [x] You are listed as the author in your notebook or README file.
- [x] Your account is listed in
[`CODEOWNERS`](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/.github/CODEOWNERS)
for the file(s).
- [x] Write your Pull Request title following the
<https://www.conventionalcommits.org/> specification.
- [x] Ensure the tests and linter pass (Run `nox -s format` from the
repository root to format).
- [x] Appropriate docs were updated (if necessary)
  • Loading branch information
want-to-be-relaxed authored Dec 13, 2024
1 parent 9b4518d commit 4712889
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 17 deletions.
1 change: 1 addition & 0 deletions .github/actions/spelling/allow.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1210,3 +1210,4 @@ ytd
yticks
zakarid
zaxis
metricx
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" \"groundedness\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.9,\n",
" 0.1,\n",
Expand All @@ -935,13 +935,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_mime_type=\"text/plain\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=\"\", # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" \"{}\"\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1153,7 +1153,7 @@
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" \"custom_metric\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.8,\n",
" 0.2,\n",
Expand All @@ -1178,13 +1178,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_mime_type=\"text/plain\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=\"\", # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" \"{}\"\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@
" optimization_mode=\"instruction\", # Optimization mode. String. Supported modes: \"instruction\", \"demonstration\", \"instruction_and_demo\"\n",
" eval_metrics_types=[\n",
" \"question_answering_correctness\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 1.0,\n",
" ], # Weights for evaluation metrics. List of floats. Length must match eval_metrics_types. Should sum to 1.\n",
Expand All @@ -839,13 +839,14 @@
" optimizer_model_qps=1, # The queries per second (QPS) sent to the optimization model. Integer greater or equal than 1 depending on your quota.\n",
" eval_qps=1, # The queries per second (QPS) sent to the eval model. Integer greater or equal than 1 depending on your quota.\n",
" source_model_qps=\"\", # The queries per second (QPS) sent to the source model. Integer greater or equal than 1 depending on your quota.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, application/json.\n",
" response_mime_type=\"application/json\", # MIME response type that the target model uses. String. Supported response: text/plain, text/x.enum, application/json.\n",
" response_schema=RESPONSE_SCHEMA, # The Vertex AI's Controlled Generation response schema that the target model uses to generate answers. String.\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" language=\"English\", # Language of the system instructions. String. Supported languages: \"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"\n",
" placeholder_to_content=json.loads(\n",
" PLACEHOLDER_TO_CONTENT\n",
" ), # Placeholder to replace any parameter in the system instruction. Dict.\n",
" data_limit=10, # Amount of data used for validation. Integer between 5 and 100.\n",
" translation_source_field_name=\"\", # Fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
")"
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,7 @@
" \"tool_name_match\",\n",
" \"tool_parameter_key_match\",\n",
" \"tool_parameter_kv_match\",\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluidity\", \"fulfillment\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_helpfulness\", \"question_answering_quality\", \"question_answering_relevance\", \"summarization_helpfulness\", \"summarization_quality\", \"summarization_verbosity\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" ], # List of evaluation metrics. List of strings. Supported metrics: \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"text_quality\", \"verbosity\", \"tool_call_valid\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\"\n",
" eval_metrics_weights=[\n",
" 0.4,\n",
" 0.3,\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@
"* Target Model: Which model you are trying to optimize your prompts to.\n",
"* Optimization Mode: The mode in which you are trying to optimize your prompt with.\n",
"* Evaluation Metrics: The evaluation metrics in which you are trying to optimize your prompts against.\n",
"* Translation Source Field Name: fill in with the corresponding field name of the source text in the data if translation metrics like Comet or MetricX are selected. Otherwise, leave it as empty.\n",
"\n",
"Refer [here](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/prompt-optimizer#configuration) to learn more about the different configuration settings and how to best utilize them."
]
},
Expand All @@ -244,7 +246,8 @@
"SOURCE_MODEL = \"\" # @param [\"\", \"gemini-1.0-pro-001\", \"gemini-1.0-pro-002\", \"gemini-1.5-flash-001\", \"gemini-1.5-flash-002\", \"gemini-1.5-pro-001\", \"gemini-1.5-pro-002\", \"gemini-1.0-ultra-001\", \"gemini-experimental\", \"gemini-flash-experimental\", \"gemini-pro-experimental\", \"text-bison@001\", \"text-bison@002\", \"text-bison32k@002\", \"text-unicorn@001\"]\n",
"TARGET_MODEL = \"gemini-1.5-flash-001\" # @param [\"gemini-1.0-pro-001\", \"gemini-1.0-pro-002\", \"gemini-1.5-flash-001\", \"gemini-1.5-flash-002\", \"gemini-1.5-pro-001\", \"gemini-1.5-pro-002\", \"gemini-1.0-ultra-001\", \"gemini-experimental\", \"gemini-flash-experimental\", \"gemini-pro-experimental\"]\n",
"OPTIMIZATION_MODE = \"instruction_and_demo\" # @param [\"instruction\", \"demonstration\", \"instruction_and_demo\"]\n",
"EVAL_METRIC = \"question_answering_correctness\" # @param [\"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}"
"EVAL_METRIC = \"question_answering_correctness\" # @param [\"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"TRANSLATION_SOURCE_FIELD_NAME = \"\" # @param {type:\"string\"}"
]
},
{
Expand Down Expand Up @@ -280,19 +283,17 @@
"\n",
"# @markdown **Multi-metric Configs**: <br/>\n",
"# @markdown Use this section only if you need more than one metric for optimization. This will override the metric you picked above.\n",
"EVAL_METRIC_1 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_1 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_1_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"EVAL_METRIC_2 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_2 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_2_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"EVAL_METRIC_3 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"exact_match\", \"fluency\", \"groundedness\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_3 = \"NA\" # @param [\"NA\", \"bleu\", \"coherence\", \"comet\", \"exact_match\", \"fluency\", \"groundedness\", \"metricx\", \"text_quality\", \"verbosity\", \"rouge_1\", \"rouge_2\", \"rouge_l\", \"rouge_l_sum\", \"safety\", \"question_answering_correctness\", \"question_answering_quality\", \"summarization_quality\", \"tool_name_match\", \"tool_parameter_key_match\", \"tool_parameter_kv_match\", \"tool_call_valid\"] {type:\"string\"}\n",
"EVAL_METRIC_3_WEIGHT = 0.0 # @param {type:\"number\"}\n",
"METRIC_AGGREGATION_TYPE = \"weighted_sum\" # @param [\"weighted_sum\", \"weighted_average\"]\n",
"\n",
"# @markdown **Misc Configs**: <br/>\n",
"PLACEHOLDER_TO_VALUE = \"{}\" # @param\n",
"RESPONSE_MIME_TYPE = (\n",
" \"text/plain\" # @param [\"text/plain\", \"application/json\", \"text/x.enum\"]\n",
")\n",
"RESPONSE_MIME_TYPE = \"text/plain\" # @param [\"text/plain\", \"application/json\", \"text/x.enum\"]\n",
"RESPONSE_SCHEMA = \"\"\n",
"TARGET_LANGUAGE = \"English\" # @param [\"English\", \"French\", \"German\", \"Hebrew\", \"Hindi\", \"Italian\", \"Japanese\", \"Korean\", \"Portuguese\", \"Simplified Chinese\", \"Spanish\", \"Traditional Chinese\"]\n",
"TOOLS = \"\" # @param\n",
Expand Down Expand Up @@ -370,6 +371,7 @@
" \"placeholder_to_content\": json.loads(PLACEHOLDER_TO_VALUE),\n",
" \"tools\": TOOLS,\n",
" \"tool_config\": TOOL_CONFIG,\n",
" \"translation_source_field_name\": TRANSLATION_SOURCE_FIELD_NAME,\n",
"}\n",
"\n",
"if EVAL_METRIC_1 == \"NA\":\n",
Expand All @@ -378,11 +380,14 @@
"else:\n",
" metrics = []\n",
" weights = []\n",
" for metric in [EVAL_METRIC_1, EVAL_METRIC_2, EVAL_METRIC_3]:\n",
" for metric, weight in zip(\n",
" [EVAL_METRIC_1, EVAL_METRIC_2, EVAL_METRIC_3],\n",
" [EVAL_METRIC_1_WEIGHT, EVAL_METRIC_2_WEIGHT, EVAL_METRIC_3_WEIGHT],\n",
" ):\n",
" if metric == \"NA\":\n",
" break\n",
" metrics.append(metric)\n",
" weights.append(EVAL_METRIC_1_WEIGHT)\n",
" weights.append(weight)\n",
" params[\"eval_metrics_types\"] = metrics\n",
" params[\"eval_metrics_weights\"] = weights\n",
"\n",
Expand Down

0 comments on commit 4712889

Please sign in to comment.