Batch OpenAI embeddings #2715

Merged · 12 commits · Oct 12, 2023
@@ -0,0 +1,88 @@
import os
import glob
import mlflow
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List
from datasets import load_dataset

DATA_READERS = {
".csv": "csv",
".tsv": "tsv",
".parquet": "parquet",
".json": "json",
".jsonl": "json",
".arrow": "arrow",
".txt": "text",
}


def init():
    global model
    global output_file
    global task_name
    global text_column

    # AZUREML_MODEL_DIR is the path where the model is located.
    # If the model is in MLflow format, you don't need to indicate anything else.
    model_path = glob.glob(os.environ["AZUREML_MODEL_DIR"] + "/*/")[0]
    # AZUREML_BI_TEXT_COLUMN is an environment variable you can use
    # to indicate which column you want to run the model on. It can be
    # used only if the model has a single input.
    text_column = os.environ.get("AZUREML_BI_TEXT_COLUMN", None)

    model = mlflow.pyfunc.load_model(model_path)
    model_info = mlflow.models.get_model_info(model_path)

    if mlflow.openai.FLAVOR_NAME not in model_info.flavors:
        raise ValueError(
            "The indicated model doesn't have an OpenAI flavor on it. Use "
            "``mlflow.openai.log_model`` to log OpenAI models."
        )

    if text_column:
        if (
            model.metadata
            and model.metadata.signature
            and len(model.metadata.signature.inputs) > 1
        ):
            raise ValueError(
                "The model requires more than 1 input column to run. You can't use "
                "AZUREML_BI_TEXT_COLUMN to indicate which column to send to the model. Format your "
                f"data with columns {model.metadata.signature.inputs.input_names()} instead."
            )

    task_name = model._model_impl.model["task"]
    output_path = os.environ["AZUREML_BI_OUTPUT_PATH"]
    output_file = os.path.join(output_path, f"{task_name}.jsonl")


def run(mini_batch: List[str]):
    if mini_batch:
        filtered_files = filter(lambda x: Path(x).suffix in DATA_READERS, mini_batch)
        results = []

        for file in filtered_files:
            data_format = Path(file).suffix
            data = load_dataset(DATA_READERS[data_format], data_files={"data": file})[
                "data"
            ].data.to_pandas()
            if text_column:
                # Keep only the column indicated by AZUREML_BI_TEXT_COLUMN.
                data = data.loc[:, [text_column]]
            scores = model.predict(data)
            results.append(
                pd.DataFrame(
                    {
                        "file": np.repeat(Path(file).name, len(scores)),
                        "row": range(0, len(scores)),
                        task_name: scores,
                    }
                )
            )

        pd.concat(results, axis="rows").to_json(
            output_file, orient="records", mode="a", lines=True
        )

    return mini_batch
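
As a sanity check, the entry points above can be exercised outside Azure ML by reproducing the environment variables the batch deployment would normally set. The sketch below is illustrative only: the folder name azureml-models, the column name text, and the assumption that Azure OpenAI credentials (OPENAI_API_KEY, OPENAI_API_BASE, OPENAI_API_TYPE, OPENAI_API_VERSION) are already exported are hypothetical choices, not part of this PR.

import os

# Hypothetical local layout: "azureml-models" contains only the registered
# "model" folder, so glob(AZUREML_MODEL_DIR + "/*/")[0] resolves to it.
os.environ["AZUREML_MODEL_DIR"] = "azureml-models"
os.environ["AZUREML_BI_OUTPUT_PATH"] = "outputs"
os.environ["AZUREML_BI_TEXT_COLUMN"] = "text"  # assumed name of the text column
os.makedirs("outputs", exist_ok=True)

import batch_driver  # the script above, assuming its folder is on sys.path

batch_driver.init()
batch_driver.run(["data/billsum-0.csv"])  # appends to outputs/embeddings.jsonl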
cli/endpoints/batch/deploy-models/openai-embeddings/data/billsum-0.csv: 13,269 additions, 0 deletions (large diff not rendered)

@@ -0,0 +1,77 @@
set -e

# <set_variables>
export ENDPOINT_NAME="<YOUR_ENDPOINT_NAME>"
# </set_variables>

# <set_openai>
OPENAI_API_BASE="https://<resource-name>.openai.azure.com/"
# </set_openai>

# <name_endpoint>
ENDPOINT_NAME="text-embedding-ada"
# </name_endpoint>

# The following code ensures the created deployment has a unique name
ENDPOINT_SUFFIX=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w ${1:-5} | head -n 1)
ENDPOINT_NAME="$ENDPOINT_NAME-$ENDPOINT_SUFFIX"

echo "Register the model"
# <register_model>
MODEL_NAME='text-embedding-ada-002'
az ml model create --name $MODEL_NAME --path "model"
# </register_model>

echo "Creating batch endpoint $ENDPOINT_NAME"
# <create_endpoint>
az ml batch-endpoint create -n $ENDPOINT_NAME -f endpoint.yml
# </create_endpoint>

echo "Creating batch deployment $DEPLOYMENT_NAME for endpoint $ENDPOINT_NAME"
# <create_deployment>
az ml batch-deployment create --file deployment.yml \
--endpoint-name $ENDPOINT_NAME \
--set-default \
--set settings.environment_variables.OPENAI_API_BASE=$OPENAI_API_BASE
# </create_deployment>

echo "Invoking batch endpoint"
# <start_batch_scoring_job>
JOB_NAME=$(az ml batch-endpoint invoke --name $ENDPOINT_NAME --input data --query name -o tsv)
# </start_batch_scoring_job>

echo "Showing job detail"
# <show_job_in_studio>
az ml job show -n $JOB_NAME --web
# </show_job_in_studio>

echo "Stream job logs to console"
# <stream_job_logs>
az ml job stream -n $JOB_NAME
# </stream_job_logs>

# <check_job_status>
STATUS=$(az ml job show -n $JOB_NAME --query status -o tsv)
echo $STATUS
if [[ $STATUS == "Completed" ]]
then
echo "Job completed"
elif [[ $STATUS == "Failed" ]]
then
echo "Job failed"
exit 1
else
echo "Job status not failed or completed"
exit 2
fi
# </check_job_status>

echo "Download scores to local path"
# <download_outputs>
az ml job download --name $JOB_NAME --output-name score --download-path ./
# </download_outputs>

echo "Delete resources"
# <delete_endpoint>
az ml batch-endpoint delete --name $ENDPOINT_NAME --yes
# </delete_endpoint>
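
After `az ml job download` completes, the scores land in a JSON Lines file named after the model task, i.e. embeddings.jsonl for this deployment, with the file, row, and embeddings columns produced by the driver. A minimal sketch of inspecting it with pandas; the recursive glob is only a convenience because the exact download folder layout isn't spelled out here:

import glob
import pandas as pd

# Locate the appended JSONL output wherever `az ml job download` placed it.
output_file = glob.glob("./**/embeddings.jsonl", recursive=True)[0]
scores = pd.read_json(output_file, lines=True)
print(scores[["file", "row"]].head())
print(len(scores["embeddings"].iloc[0]))  # dimensionality of one embedding vector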
@@ -0,0 +1,28 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchDeployment.schema.json
endpoint_name: text-embedding-ada-qwerty
name: default
description: The default deployment for generating embeddings
type: model
model: azureml:text-embedding-ada-002@latest
environment:
  name: batch-openai-mlflow
  image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
  conda_file: environment/conda.yaml
code_configuration:
  code: code
  scoring_script: batch_driver.py
compute: azureml:batch-cluster-lp
resources:
  instance_count: 1
settings:
  max_concurrency_per_instance: 1
  mini_batch_size: 1
  output_action: summary_only
  retry_settings:
    max_retries: 1
    timeout: 9999
  logging_level: info
  environment_variables:
    OPENAI_API_TYPE: azure_ad
    OPENAI_API_BASE: $OPENAI_API_BASE
    OPENAI_API_VERSION: 2023-03-15-preview
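
The OPENAI_API_TYPE: azure_ad setting means the deployment authenticates against Azure OpenAI with a Microsoft Entra ID token rather than a static API key. Below is a hedged sketch of one way a scoring environment could obtain such a token with the azure-identity package already listed in conda.yaml; this wiring is not part of the PR and the exact mechanism may differ:

import os
from azure.identity import DefaultAzureCredential

# Acquire a token for the Cognitive Services scope using the compute's identity
# and expose it where openai 0.27.x expects the key when api_type is azure_ad.
token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default")
os.environ["OPENAI_API_KEY"] = token.token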
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchEndpoint.schema.json
name: text-embedding-ada-qwerty
description: An endpoint to generate embeddings in batch for the ADA-002 model from OpenAI
auth_mode: aad_token
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
  - azureml-core
  - azure-identity
  - datasets
  - mlflow
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: batch-openai-mlflow
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
conda_file: conda.yaml
@@ -0,0 +1,19 @@
flavors:
  openai:
    code: null
    data: model.yaml
    openai_version: 0.27.8
  python_function:
    data: model.yaml
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.openai
    python_version: 3.8.5
mlflow_version: 2.5.1.dev0
model_uuid: b9a39a71f54e41efbd83b8307294b4d8
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
  params: '[{"name": "batch_size", "dtype": "long", "default": 16, "shape": null}]'
utc_time_created: '2023-08-15 05:08:52.461694'
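
For context, an MLmodel file with this openai flavor is what `mlflow.openai.log_model` writes, which is exactly what the driver's init() checks for. A minimal sketch of logging the same embeddings model, assuming openai 0.27.x, a recent MLflow, and Azure OpenAI credentials already configured in the environment:

import mlflow
import openai

# Log text-embedding-ada-002 as an embeddings task; this produces the
# openai/python_function flavors shown above in the "model" artifact folder.
with mlflow.start_run():
    mlflow.openai.log_model(
        model="text-embedding-ada-002",
        task=openai.Embedding,
        artifact_path="model",
    )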
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - mlflow==2.5.0
  - gunicorn==20.1.0
  - numpy==1.24.4
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
name: mlflow-env
@@ -0,0 +1,3 @@
engine: text-embedding-ada-002
model: text-embedding-ada-002
task: embeddings
@@ -0,0 +1,7 @@
python: 3.8.5
build_dependencies:
- pip==23.2.1
- setuptools
- wheel==0.38.4
dependencies:
- -r requirements.txt
@@ -0,0 +1,7 @@
mlflow==2.7.0
gunicorn==20.1.0
numpy==1.24.4
openai==0.27.8
requests==2.31.0
tenacity==8.2.2
tiktoken==0.4.0