Skip to content

Commit

Permalink
feat(components): Use a single inference component for AutoSxS
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 601877680
  • Loading branch information
TheMichaelHu authored and Google Cloud Pipeline Components maintainers committed Jan 26, 2024
1 parent c803a69 commit 8c7b5b2
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 133 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def arbiter_preprocess(
and responses.
id_columns: The columns which distinguish unique evaluation examples.
response_column_a: The column containing responses for model a.
response_column_b: The column containing responses for model a.
response_column_b: The column containing responses for model b.
task: Task to evaluate.
output_path: Path to write the path where preprocessed predictions are
stored.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,16 @@ def _resolve_image() -> str:
@dsl.container_component
def autosxs_metrics_computer(
judgments_dir: str,
has_human_preference: bool,
autosxs_metrics: dsl.Output[dsl.Metrics], # pylint: disable=unused-argument # pytype: disable=unsupported-operands
gcp_resources: dsl.OutputPath(str), # pytype: disable=invalid-annotation
human_preference_column: str = '',
) -> dsl.ContainerSpec: # pylint: disable=g-doc-args
"""Compute AutoSXS metrics using judgments outputs from Arbiter.
Args:
judgments_dir: Path where store the Judgments.
has_human_preference: Boolean value. True if users provided human preference
data, otherwise false.
human_preference_column: The column containing ground truths. The default
value is an empty string if not be provided by users.
Returns:
autosxs_metrics: Autosxs win rate metrics and human alignment metrics.
Expand All @@ -58,7 +58,7 @@ def autosxs_metrics_computer(
'--', # Used to mark the start of component flags.
'autosxs_metrics',
f'--judgments_dir={judgments_dir}',
f'--has_human_preference={has_human_preference}',
f'--human_preference_column={human_preference_column}',
'--executor_input={{$.json_escape[1]}}',
],
),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Copyright 2024 The Kubeflow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Component for running LLM Batch Prediction jobs side-by-side."""

import os
from typing import Any, Dict, List

from google_cloud_pipeline_components import _placeholders
from google_cloud_pipeline_components import utils as gcpc_utils
from google_cloud_pipeline_components._implementation.llm import utils
from kfp import dsl


def _resolve_image() -> str:
"""Determines the image URI to create a container from."""
return os.environ.get(
'AUTOSXS_IMAGE_OVERRIDE'
) or utils.get_default_image_uri('autosxs')


# pylint: disable=unused-argument,dangerous-default-value
@dsl.container_component
def batch_prediction_sxs(
display_name: str,
evaluation_dataset: str,
id_columns: List[str],
task: str,
autorater_prompt_parameters: Dict[str, Dict[str, str]],
response_column_a: str,
response_column_b: str,
preprocessed_evaluation_dataset: dsl.Output[dsl.Dataset], # pylint: disable=unused-argument # pytype: disable=unsupported-operands
preprocessed_evaluation_dataset_uri: dsl.OutputPath(str), # pylint: disable=unused-argument # pytype: disable=invalid-annotation
gcp_resources: dsl.OutputPath(str), # pytype: disable=invalid-annotation
metadata: dsl.OutputPath(Dict[str, Any]), # pytype: disable=invalid-annotation
model_a: str = '',
model_b: str = '',
model_a_prompt_parameters: Dict[str, Dict[str, str]] = {},
model_b_prompt_parameters: Dict[str, Dict[str, str]] = {},
model_a_parameters: Dict[str, str] = {},
model_b_parameters: Dict[str, str] = {},
human_preference_column: str = '',
) -> dsl.ContainerSpec: # pylint: disable=g-doc-args
"""Runs up to two LLM Batch Prediction jobs side-by-side.
Args:
display_name: Display name for the batch prediction job.
evaluation_dataset: GCS or BigQuery URIs representing a dataset of prompts
and responses.
id_columns: The columns which distinguish unique evaluation examples.
task: Task to evaluate.
autorater_prompt_parameters: Map of autorater prompt template parameters to
columns or templates.
response_column_a: The column containing responses for model a.
response_column_b: The column containing responses for model b.
model_a: A fully-qualified model resource name
(`projects/{project}/locations/{location}/models/{model}@{version}`) or
publisher model resource name (`publishers/{publisher}/models/{model}`).
This parameter is optional if Model A responses are specified.
model_b: A fully-qualified model resource name
(`projects/{project}/locations/{location}/models/{model}@{version}`) or
publisher model resource name (`publishers/{publisher}/models/{model}`).
This parameter is optional if Model B responses are specified.
model_a_prompt_parameters: Map of model A prompt template parameters to
columns or templates.
model_b_prompt_parameters: Map of model B prompt template parameters to
columns or templates.
model_a_parameters: The parameters that govern the predictions from model A,
such as temperature or maximum output tokens.
model_b_parameters: The parameters that govern the predictions from model B,
such as temperature or maximum output tokens.
human_preference_column: The column containing ground truths. The default
value is an empty string if not be provided by users.
Returns:
preprocessed_evaluation_dataset: Dataset of the table containing the inputs
expected by the Arbiter.
preprocessed_evaluation_dataset_uri: URI of the table containing the inputs
expected by the Arbiter.
gcp_resources: Tracker for GCP resources created by this component.
metadata_path: Path to write the object that stores computed metrics
metadata for the task preprocess component.
"""
return gcpc_utils.build_serverless_customjob_container_spec(
project=_placeholders.PROJECT_ID_PLACEHOLDER,
location=_placeholders.LOCATION_PLACEHOLDER,
custom_job_payload=utils.build_payload(
display_name='batch_prediction_sxs',
machine_type='n1-standard-4',
image_uri=_resolve_image(),
args=[
'--', # Used to mark the start of component flags.
'batch_prediction_sxs',
f'--display_name={display_name}',
f'--evaluation_dataset={evaluation_dataset}',
(
'--id_columns='
"{{$.inputs.parameters['id_columns'].json_escape[0]}}"
),
f'--task={task}',
f'--project={_placeholders.PROJECT_ID_PLACEHOLDER}',
f'--location={_placeholders.LOCATION_PLACEHOLDER}',
f'--model_a={model_a}',
f'--model_b={model_b}',
(
'--model_a_prompt_parameters='
"{{$.inputs.parameters['model_a_prompt_parameters']"
'.json_escape[0]}}'
),
(
'--model_b_prompt_parameters='
"{{$.inputs.parameters['model_b_prompt_parameters']"
'.json_escape[0]}}'
),
(
'--autorater_prompt_parameters='
"{{$.inputs.parameters['autorater_prompt_parameters']"
'.json_escape[0]}}'
),
f'--response_column_a={response_column_a}',
f'--response_column_b={response_column_b}',
(
'--model_a_parameters='
"{{$.inputs.parameters['model_a_parameters'].json_escape[0]}}"
),
(
'--model_b_parameters='
"{{$.inputs.parameters['model_b_parameters'].json_escape[0]}}"
),
f'--human_preference_column={human_preference_column}',
f'--staging_dir={dsl.PIPELINE_ROOT_PLACEHOLDER}',
f'--preprocessed_evaluation_dataset_uri={preprocessed_evaluation_dataset_uri}',
f'--metadata_path={metadata}',
f'--gcp_resources_path={gcp_resources}',
'--executor_input={{$.json_escape[1]}}',
],
),
gcp_resources=gcp_resources,
)
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
"""A collection of constants shared across components and pipelines."""
import os

_DEFAULT_AUTOSXS_IMAGE_TAG = '20240116_0507_RC00'
_DEFAULT_AUTOSXS_IMAGE_TAG = '20240123_0507_RC00'


def get_private_image_tag() -> str:
return os.getenv('PRIVATE_IMAGE_TAG') or '20240124_0507_RC00'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,73 +16,12 @@
from typing import Any, Dict, List

from google_cloud_pipeline_components import _placeholders
from google_cloud_pipeline_components._implementation.llm import arbiter_preprocess
from google_cloud_pipeline_components._implementation.llm import autosxs_arbiter
from google_cloud_pipeline_components._implementation.llm import autosxs_metrics_computer
from google_cloud_pipeline_components._implementation.llm import function_based
from google_cloud_pipeline_components._implementation.llm import task_preprocess
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.v1 import batch_predict_job
from google_cloud_pipeline_components._implementation.llm import batch_prediction_sxs
from kfp import dsl


# pylint: disable=no-value-for-parameter
@dsl.pipeline(
name='predictions-pipeline',
description='Runs the prediction pipeline for one of the two SxS models.',
)
def _get_predictions(
name: str,
project: str,
location: str,
model: str,
model_parameters: Dict[str, str],
prediction_inputs: List[str],
is_model_inference: bool,
) -> str:
"""Makes predictions for a given model."""
with dsl.If(is_model_inference == True, name='Inference Required'): # pylint: disable=singleton-comparison
get_vertex_model_task = dsl.importer(
artifact_uri=(
f'https://{location}-aiplatform.googleapis.com/v1/{model}'
),
artifact_class=artifact_types.VertexModel,
metadata={'resourceName': model},
).set_display_name('Import Vertex Model Artifact')

batch_predict_task = batch_predict_job.ModelBatchPredictOp(
project=project,
location=location,
model=get_vertex_model_task.outputs['artifact'],
job_display_name=(
f'autosxs-{name}-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}'
),
gcs_source_uris=prediction_inputs,
instances_format='jsonl',
predictions_format='jsonl',
gcs_destination_output_uri_prefix=(
f'{dsl.PIPELINE_ROOT_PLACEHOLDER}/{dsl.PIPELINE_TASK_ID_PLACEHOLDER}'
f'/{name}_predictions'
),
model_parameters=model_parameters,
)
prediction_uris_from_inference = function_based.get_uri(
artifact=batch_predict_task.outputs['gcs_output_directory'],
is_dir=True,
)

with dsl.Else(name='Responses Provided'): # pylint: disable=singleton-comparison
prediction_uris_inference_provided = function_based.get_empty_string()

prediction_uris = dsl.OneOf(
prediction_uris_from_inference.output,
prediction_uris_inference_provided.output,
)

# We can't directly output dsl.OneOf, so we need to use identity.
return function_based.identity(x=prediction_uris).output


# pylint: disable=dangerous-default-value,g-bare-generic,unused-argument
@dsl.pipeline(
name='autosxs-template',
Expand Down Expand Up @@ -132,72 +71,24 @@ def autosxs_pipeline(
experimental_args: Experimentally released arguments. Subject to change.
"""
# fmt: on
prediction_inputs_a = task_preprocess.task_preprocess(
arbiter_input = batch_prediction_sxs.batch_prediction_sxs(
display_name='autosxs-{{$.pipeline_job_uuid}}-{{$.pipeline_task_uuid}}',
evaluation_dataset=evaluation_dataset,
task=task,
model_prompt_parameters=model_a_prompt_parameters,
response_column=response_column_a,
human_preference_column=human_preference_column,
id_columns=id_columns,
).set_display_name('Preprocess Model A Inputs')

prediction_inputs_b = task_preprocess.task_preprocess(
evaluation_dataset=evaluation_dataset,
task=task,
model_prompt_parameters=model_b_prompt_parameters,
response_column=response_column_b,
human_preference_column=human_preference_column,
id_columns=id_columns,
).set_display_name('Preprocess Model B Inputs')

is_model_a_inference = function_based.get_usage_metric(
metadata=prediction_inputs_a.outputs['metadata'],
key='is_model_inference',
).set_display_name('Read is_model_a_inference')

is_model_b_inference = function_based.get_usage_metric(
metadata=prediction_inputs_b.outputs['metadata'],
key='is_model_inference',
).set_display_name('Read is_model_b_inference')

inferrer_a = _get_predictions(
name='A',
project=project,
location=location,
model=model_a,
model_parameters=model_a_parameters,
prediction_inputs=prediction_inputs_a.outputs['prediction_inputs'],
is_model_inference=is_model_a_inference.output,
).set_display_name('Model A Responses')

inferrer_b = _get_predictions(
name='B',
project=project,
location=location,
model=model_b,
model_parameters=model_b_parameters,
prediction_inputs=prediction_inputs_b.outputs['prediction_inputs'],
is_model_inference=is_model_b_inference.output,
).set_display_name('Model B Responses')

arbiter_input_preprocess = arbiter_preprocess.arbiter_preprocess(
autorater_prompt_parameters=autorater_prompt_parameters,
evaluation_dataset=evaluation_dataset,
id_columns=id_columns,
prediction_uris_b=inferrer_b.output,
prediction_uris_a=inferrer_a.output,
model_a_prompt_parameters=model_a_prompt_parameters,
model_b_prompt_parameters=model_b_prompt_parameters,
task=task,
response_column_a=response_column_a,
response_column_b=response_column_b,
model_a=model_a,
model_b=model_b,
model_a_prompt_parameters=model_a_prompt_parameters,
model_b_prompt_parameters=model_b_prompt_parameters,
model_a_parameters=model_a_parameters,
model_b_parameters=model_b_parameters,
human_preference_column=human_preference_column,
is_bp_output_a=is_model_a_inference.output,
is_bp_output_b=is_model_b_inference.output,
).set_display_name('Preprocess Predictions')

).set_display_name('AutoSxS Batch Prediction')
autosxs_arbiter_task = autosxs_arbiter.autosxs_arbiter(
inference_output_uri=arbiter_input_preprocess.outputs[
inference_output_uri=arbiter_input.outputs[
'preprocessed_evaluation_dataset_uri'
],
id_columns=id_columns,
Expand All @@ -207,13 +98,7 @@ def autosxs_pipeline(
bigquery_destination_prefix=bigquery_destination_prefix,
experimental_args=experimental_args,
).set_display_name('AutoSxS Arbiter')

has_human_preference = function_based.get_usage_metric(
metadata=prediction_inputs_a.outputs['metadata'],
key='has_human_preference_column',
).set_display_name('Read has_human_preference_column')

autosxs_metrics_computer.autosxs_metrics_computer(
judgments_dir=autosxs_arbiter_task.outputs['judgments_uri'],
has_human_preference=has_human_preference.output,
human_preference_column=human_preference_column,
).set_display_name('AutoSxS Metrics')

0 comments on commit 8c7b5b2

Please sign in to comment.