From 637db321f7f1517c2ff5bd9989ae9581193c9c9a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 30 Jul 2024 23:30:49 -0700 Subject: [PATCH 01/80] Initial code migration, start the testing structure --- qa/L0_openai/openai_tritonserver/Dockerfile | 5 + .../openai_tritonserver/app/__init__.py | 0 qa/L0_openai/openai_tritonserver/app/main.py | 516 +++++++++++ .../app/openai_protocol_types.py | 870 ++++++++++++++++++ .../openai_tritonserver/app/test_main.py | 27 + .../app/transformers_utils/__init__.py | 0 .../app/transformers_utils/tokenizer.py | 150 +++ 7 files changed, 1568 insertions(+) create mode 100644 qa/L0_openai/openai_tritonserver/Dockerfile create mode 100644 qa/L0_openai/openai_tritonserver/app/__init__.py create mode 100644 qa/L0_openai/openai_tritonserver/app/main.py create mode 100644 qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py create mode 100644 qa/L0_openai/openai_tritonserver/app/test_main.py create mode 100644 qa/L0_openai/openai_tritonserver/app/transformers_utils/__init__.py create mode 100644 qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py diff --git a/qa/L0_openai/openai_tritonserver/Dockerfile b/qa/L0_openai/openai_tritonserver/Dockerfile new file mode 100644 index 0000000000..898f68bf95 --- /dev/null +++ b/qa/L0_openai/openai_tritonserver/Dockerfile @@ -0,0 +1,5 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 +FROM ${BASE_IMAGE} +# TODO: This should be installed in Triton container by default IMO +RUN pip install /opt/tritonserver/python/*.whl +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" diff --git a/qa/L0_openai/openai_tritonserver/app/__init__.py b/qa/L0_openai/openai_tritonserver/app/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/openai_tritonserver/app/main.py b/qa/L0_openai/openai_tritonserver/app/main.py new file mode 100644 index 0000000000..3881564480 --- /dev/null +++ b/qa/L0_openai/openai_tritonserver/app/main.py @@ -0,0 +1,516 @@ +# generated by fastapi-codegen: +# filename: openai_trimmed.yml +# timestamp: 2024-05-05T21:52:36+00:00 + +from __future__ import annotations + +import argparse +import os +import time +import uuid +from contextlib import asynccontextmanager +from typing import Optional, Union + +import numpy +import tritonserver +import uvicorn +from fastapi import FastAPI, HTTPException, Request +from fastapi.responses import Response, StreamingResponse + +# TODO: transformer utils needed? +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +from .openai_protocol_types import ( + ChatCompletionChoice, + ChatCompletionFinishReason, + ChatCompletionResponseMessage, + ChatCompletionStreamingResponseChoice, + ChatCompletionStreamResponseDelta, + Choice, + CreateChatCompletionRequest, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, + CreateCompletionRequest, + CreateCompletionResponse, + FinishReason, + ListModelsResponse, + Model, + ObjectType, +) +from .transformers_utils.tokenizer import get_tokenizer + +# TODO: Remove +SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm"} +KNOWN_MODELS = {"gpt2": "hf:gpt2"} + +# TODO: What is this for? 
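+# Note: "OWNED_BY" is only surfaced as the "owned_by" field of the Model objects
+# returned by the /v1/models routes below; "ACME" looks like an arbitrary placeholder.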
+OWNED_BY = "ACME" +TIMEOUT_KEEP_ALIVE = 5 # seconds + +server: tritonserver.Server +model: tritonserver.Model +model_source_name: str +model_create_time: int +backend: str +tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] +create_inference_request = None + + +# TODO: Re-organize helpers +def load_model(server): + model = None + backends = [] + tokenizer = None + model_source_name = None + for model_name, version in server.models().keys(): + if version != -1: + continue + current_model = server.load(model_name) + backends.append(current_model.config()["backend"]) + if model_name in KNOWN_MODELS.keys(): + model = current_model + model_source_name = KNOWN_MODELS[model_name].replace("hf:", "") + tokenizer = get_tokenizer(model_source_name) + if model and tokenizer: + for backend in backends: + if backend in SUPPORTED_BACKENDS: + return model, int(time.time()), backend, tokenizer, model_source_name + return None, None, None, None, None + + +def init_tritonserver(): + # TODO: How to pass arguments to server here? + model_repository = os.environ.get( + "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" + ) + + print("Starting Triton Server Core...") + server = tritonserver.Server( + model_repository=model_repository, + log_verbose=1, + log_info=True, + log_warn=True, + log_error=True, + model_control_mode=tritonserver.ModelControlMode.EXPLICIT, + ).start(wait_until_ready=True) + + # TODO: Cleanup + # print("Loading Model...\n\n") + + # model, model_create_time, backend, tokenizer, model_source_name = load_model(server) + + # if not (model and backend and tokenizer and model_create_time): + # raise Exception("Unknown Model") + + # print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") + + # if backend == "vllm": + # create_inference_request = create_vllm_inference_request + # elif backend == "tensorrtllm": + # create_inference_request = create_trtllm_inference_request + + return server + + +@asynccontextmanager +async def lifespan(app: FastAPI): + print("Starting FastAPI app lifespan...") + # Start the tritonserver on FastAPI app startup + try: + print("Starting Triton Inference Server...") + app.server = init_tritonserver() + except Exception as e: + print(f"Failed to start Triton Inference Server: {e}") + app.server = None + + yield + + # Cleanup the tritonserver on FastAPI app shutdown + print("Shutting down FastAPI app lifespan...") + if app.server: + print("Shutting down Triton Inference Server...") + app.server.stop() + + +app = FastAPI( + title="OpenAI API", + description="The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details.", + version="2.0.0", + termsOfService="https://openai.com/policies/terms-of-use", + contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, + license={ + "name": "MIT", + "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, + servers=[{"url": "https://api.openai.com/v1"}], + lifespan=lifespan, +) + + +# TODO: use router? +@app.get("/health") +def health() -> Response: + if not app.server: + raise HTTPException( + status_code=400, + detail="Triton Inference Server failed to start successfully.", + ) + + if not app.server.live(): + raise HTTPException( + status_code=400, detail="Triton Inference Server is not live." 
+ ) + + return Response(status_code=200) + + +def get_output(response): + if "text_output" in response.outputs: + try: + return response.outputs["text_output"].to_string_array()[0] + except: + return str(response.outputs["text_output"].to_bytes_array()[0]) + return None + + +def streaming_chat_completion_response(request_id, created, model, role, responses): + # first chunk + + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=role, content=None, function_call=None + ), + logprobs=None, + finish_reason=None, + ) + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + for response in responses: + text = get_output(response) + + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=None, content=text, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop if response.final else None, + ) + + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + yield "data: [DONE]\n\n" + + +def create_vllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + sampling_parameters = request.copy( + exclude={"model", "stream", "messages", "prompt", "echo"}, + ).model_dump(exclude_none=True) + inputs["text_input"] = [prompt] + inputs["stream"] = [request.stream] + exclude_input_in_output = True + echo = getattr(request, "echo", None) + if echo: + exclude_input_in_output = not echo + inputs["exclude_input_in_output"] = [exclude_input_in_output] + return model.create_request(inputs=inputs, parameters=sampling_parameters) + + +def create_trtllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + if model.name == "llama-3-8b-instruct": + inputs["stop_words"] = [["<|eot_id|>", "<|end_of_text|>"]] + inputs["text_input"] = [[prompt]] + inputs["stream"] = [[request.stream]] + if request.max_tokens: + inputs["max_tokens"] = numpy.int32([[request.max_tokens]]) + if request.stop: + if isinstance(request.stop, str): + request.stop = [request.stop] + inputs["stop_words"] = [request.stop] + if request.top_p: + inputs["top_p"] = numpy.float32([[request.top_p]]) + if request.frequency_penalty: + inputs["frequency_penalty"] = numpy.float32([[request.frequency_penalty]]) + if request.presence_penalty: + inputs["presence_penalty":] = numpy.int32([[request.presence_penalty]]) + if request.seed: + inputs["random_seed"] = numpy.uint64([[request.seed]]) + if request.temperature: + inputs["temperature"] = numpy.float32([[request.temperature]]) + + return model.create_request(inputs=inputs) + + +@app.post( + "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] +) +def create_chat_completion( + request: CreateChatCompletionRequest, +) -> CreateChatCompletionResponse | StreamingResponse: + """ + Creates a model response for the given chat conversation. 
+ """ + + if not model or not tokenizer or not create_inference_request: + raise Exception("Unknown Model") + + add_generation_prompt_default = True + default_role = "assistant" + + if request.model != model.name and request.model != model_source_name: + raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + + if request.n and request.n > 1: + raise HTTPException(status_code=400, detail=f"Only single choice is supported") + + conversation = [ + {"role": str(message.role), "content": str(message.content)} + for message in request.messages + ] + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=add_generation_prompt_default, + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + responses = model.infer(create_inference_request(model, prompt, request)) + + if request.stream: + return StreamingResponse( + streaming_chat_completion_response( + request_id, created, request.model, conversation[-1]["role"], responses + ) + ) + + response = list(responses)[0] + + text = get_output(response) + + return CreateChatCompletionResponse( + id=request_id, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionResponseMessage( + content=text, role=default_role, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop, + ) + ], + created=created, + model=request.model, + system_fingerprint=None, + object=ObjectType.chat_completion, + ) + + +def streaming_completion_response(request_id, created, model, responses): + for response in responses: + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop if response.final else None, + index=0, + logprobs=None, + text=text, + ) + response = CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model, + ) + + yield f"data: {response.json(exclude_unset=True)}\n\n" + yield "data: [DONE]\n\n" + + +@app.post( + "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] +) +def create_completion( + request: CreateCompletionRequest, raw_request: Request +) -> CreateCompletionResponse | StreamingResponse: + """ + Creates a completion for the provided prompt and parameters. 
+ """ + + if not model or not tokenizer or not create_inference_request: + raise Exception("Unknown Model") + + if request.suffix is not None: + raise HTTPException(status_code=400, detail="suffix is not currently supported") + + if request.model != model.name and request.model != model_source_name: + raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + + if request.prompt is None: + request.prompt = "<|endoftext|>" + + # Currently only support single string as input + if not isinstance(request.prompt, str): + raise HTTPException( + status_code=400, detail="only single string input is supported" + ) + + if request.logit_bias is not None or request.logprobs is not None: + raise HTTPException( + status_code=400, detail="logit bias and log probs not supported" + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + responses = model.infer(create_inference_request(model, request.prompt, request)) + if request.stream: + return StreamingResponse( + streaming_completion_response(request_id, created, model.name, responses) + ) + response = list(responses)[0] + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop if response.final else None, + index=0, + logprobs=None, + text=text, + ) + return CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model.name, + ) + + +@app.get("/metrics") +def metrics() -> str: + return server.metrics() + + +@app.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) +def list_models() -> ListModelsResponse: + """ + Lists the currently available models, and provides basic information about each one such as the owner and availability. + """ + + model_list = [ + Model( + id=model.name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + Model( + id=model_source_name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + ] + + return ListModelsResponse(object=ObjectType.list, data=model_list) + + +@app.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) +def retrieve_model(model_name: str) -> Model: + """ + Retrieves a model instance, providing basic information about the model such as the owner and permissioning. + """ + + if model_name == model.name: + return Model( + id=model.name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ) + + if model_name == model_source_name: + return Model( + id=model_source_name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ) + + raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Triton OpenAI Compatible RESTful API server." 
+ ) + parser.add_argument("--host", type=str, default=None, help="host name") + parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=["debug", "info", "warning", "error", "critical", "trace"], + help="log level for uvicorn", + ) + parser.add_argument( + "--response-role", type=str, default="assistant", help="The role name to return" + ) + + parser.add_argument( + "--tritonserver-log-level", + type=int, + default=0, + help="The tritonserver log level", + ) + + parser.add_argument( + "--model-repository", + type=str, + default="/workspace/llm-models", + help="model repository", + ) + return parser.parse_args() + + +if __name__ == "__main__": + # TODO: Cleanup + args = parse_args() + + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ) diff --git a/qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py b/qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py new file mode 100644 index 0000000000..5082bbea8d --- /dev/null +++ b/qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py @@ -0,0 +1,870 @@ +# generated by fastapi-codegen: +# filename: api-spec/openai_trimmed.yml +# timestamp: 2024-05-05T21:52:36+00:00 + +from __future__ import annotations + +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +from pydantic import AnyUrl, BaseModel, Extra, Field, RootModel, confloat, conint + + +class Error(BaseModel): + code: str + message: str + param: str + type: str + + +class ErrorResponse(BaseModel): + error: Error + + +class Object(Enum): + list = "list" + + +class DeleteModelResponse(BaseModel): + id: str + deleted: bool + object: str + + +class Model1(Enum): + gpt_3_5_turbo_instruct = "gpt-3.5-turbo-instruct" + davinci_002 = "davinci-002" + babbage_002 = "babbage-002" + + +class PromptItem(RootModel): + root: List[Any] + + +class CreateCompletionRequest(BaseModel): + model: Union[str, Model1] = Field( + ..., + description="ID of the model to use. You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them.\n", + ) + prompt: Union[str, List[str], List[int], List[PromptItem]] = Field( + ..., + description="The prompt(s) to generate completions for, encoded as a string, array of strings, array of tokens, or array of token arrays.\n\nNote that <|endoftext|> is the document separator that the model sees during training, so if a prompt is not specified the model will generate as if from the beginning of a new document.\n", + ) + best_of: Optional[conint(ge=0, le=20)] = Field( + 1, + description='Generates `best_of` completions server-side and returns the "best" (the one with the highest log probability per token). Results cannot be streamed.\n\nWhen used with `n`, `best_of` controls the number of candidate completions and `n` specifies how many to return – `best_of` must be greater than `n`.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n', + ) + echo: Optional[bool] = Field( + False, description="Echo back the prompt in addition to the completion\n" + ) + frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + logit_bias: Optional[Dict[str, int]] = Field( + None, + description='Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the GPT tokenizer) to an associated bias value from -100 to 100. You can use this [tokenizer tool](/tokenizer?view=bpe) to convert text to token IDs. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n\nAs an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token from being generated.\n', + ) + logprobs: Optional[conint(ge=0, le=5)] = Field( + None, + description="Include the log probabilities on the `logprobs` most likely output tokens, as well the chosen tokens. For example, if `logprobs` is 5, the API will return a list of the 5 most likely tokens. The API will always return the `logprob` of the sampled token, so there may be up to `logprobs+1` elements in the response.\n\nThe maximum value for `logprobs` is 5.\n", + ) + max_tokens: Optional[conint(ge=0)] = Field( + 16, + description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", + example=16, + ) + n: Optional[conint(ge=1, le=128)] = Field( + 1, + description="How many completions to generate for each prompt.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n", + example=1, + ) + presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( + None, + description="If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\n\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", + ) + stop: Optional[Union[str, List[str]]] = Field( + None, + description="Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.\n", + ) + stream: Optional[bool] = Field( + False, + description="Whether to stream back partial progress. 
If set, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", + ) + suffix: Optional[str] = Field( + None, + description="The suffix that comes after a completion of inserted text.\n\nThis parameter is only supported for `gpt-3.5-turbo-instruct`.\n", + example="test.", + ) + temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( + 1, + description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", + example=1, + ) + top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( + 1, + description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", + example=1, + ) + user: Optional[str] = Field( + None, + description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", + example="user-1234", + ) + + +class FinishReason(Enum): + stop = "stop" + length = "length" + content_filter = "content_filter" + + +class Logprobs(BaseModel): + text_offset: Optional[List[int]] = None + token_logprobs: Optional[List[float]] = None + tokens: Optional[List[str]] = None + top_logprobs: Optional[List[Dict[str, float]]] = None + + +class Choice(BaseModel): + finish_reason: FinishReason | None = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\nor `content_filter` if content was omitted due to a flag from our content filters.\n", + ) + index: int + logprobs: Logprobs | None + text: str + + +class Object1(Enum): + text_completion = "text_completion" + + +class Type(Enum): + image_url = "image_url" + + +class Detail(Enum): + auto = "auto" + low = "low" + high = "high" + + +class ImageUrl(BaseModel): + url: AnyUrl = Field( + ..., description="Either a URL of the image or the base64 encoded image data." + ) + detail: Optional[Detail] = Field( + "auto", + description="Specifies the detail level of the image. 
Learn more in the [Vision guide](/docs/guides/vision/low-or-high-fidelity-image-understanding).", + ) + + +class ChatCompletionRequestMessageContentPartImage(BaseModel): + type: Type = Field(..., description="The type of the content part.") + image_url: ImageUrl + + +class Type1(Enum): + text = "text" + + +class ChatCompletionRequestMessageContentPartText(BaseModel): + type: Type1 = Field(..., description="The type of the content part.") + text: str = Field(..., description="The text content.") + + +class Role(Enum): + system = "system" + + def __str__(self): + return self.name + + +class ChatCompletionRequestSystemMessage(BaseModel): + content: str = Field(..., description="The contents of the system message.") + role: Role = Field( + ..., description="The role of the messages author, in this case `system`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + + +class Role1(Enum): + user = "user" + + def __str__(self): + return self.name + + +class Role2(Enum): + assistant = "assistant" + + def __str__(self): + return self.name + + +class FunctionCall(BaseModel): + arguments: str = Field( + ..., + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + name: str = Field(..., description="The name of the function to call.") + + +class Role3(Enum): + tool = "tool" + + def __str__(self): + return self.name + + +class ChatCompletionRequestToolMessage(BaseModel): + role: Role3 = Field( + ..., description="The role of the messages author, in this case `tool`." + ) + content: str = Field(..., description="The contents of the tool message.") + tool_call_id: str = Field( + ..., description="Tool call that this message is responding to." + ) + + +class Role4(Enum): + function = "function" + + def __str__(self): + return self.name + + +class ChatCompletionRequestFunctionMessage(BaseModel): + role: Role4 = Field( + ..., description="The role of the messages author, in this case `function`." + ) + content: str = Field(..., description="The contents of the function message.") + name: str = Field(..., description="The name of the function to call.") + + +class FunctionParameters(BaseModel): + pass + + class Config: + extra = Extra.allow + + +class ChatCompletionFunctions(BaseModel): + description: Optional[str] = Field( + None, + description="A description of what the function does, used by the model to choose when and how to call the function.", + ) + name: str = Field( + ..., + description="The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", + ) + parameters: Optional[FunctionParameters] = None + + +class ChatCompletionFunctionCallOption(BaseModel): + name: str = Field(..., description="The name of the function to call.") + + +class Type2(Enum): + function = "function" + + +class FunctionObject(BaseModel): + description: Optional[str] = Field( + None, + description="A description of what the function does, used by the model to choose when and how to call the function.", + ) + name: str = Field( + ..., + description="The name of the function to be called. 
Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", + ) + parameters: Optional[FunctionParameters] = None + + +class ChatCompletionToolChoiceOption1(Enum): + none = "none" + auto = "auto" + required = "required" + + +class Function(BaseModel): + name: str = Field(..., description="The name of the function to call.") + + +class ChatCompletionNamedToolChoice(BaseModel): + type: Type2 = Field( + ..., + description="The type of the tool. Currently, only `function` is supported.", + ) + function: Function + + +class Function1(BaseModel): + name: str = Field(..., description="The name of the function to call.") + arguments: str = Field( + ..., + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + + +class ChatCompletionMessageToolCall(BaseModel): + id: str = Field(..., description="The ID of the tool call.") + type: Type2 = Field( + ..., + description="The type of the tool. Currently, only `function` is supported.", + ) + function: Function1 = Field(..., description="The function that the model called.") + + +class Function2(BaseModel): + name: Optional[str] = Field(None, description="The name of the function to call.") + arguments: Optional[str] = Field( + None, + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + + +class ChatCompletionMessageToolCallChunk(BaseModel): + index: int + id: Optional[str] = Field(None, description="The ID of the tool call.") + type: Optional[Type2] = Field( + None, + description="The type of the tool. Currently, only `function` is supported.", + ) + function: Optional[Function2] = None + + +class ChatCompletionRole(Enum): + system = "system" + user = "user" + assistant = "assistant" + tool = "tool" + function = "function" + + +class Role5(Enum): + assistant = "assistant" + + def __str__(self): + return self.name + + +class FunctionCall2(BaseModel): + arguments: Optional[str] = Field( + None, + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + name: Optional[str] = Field(None, description="The name of the function to call.") + + +class Role6(Enum): + system = "system" + user = "user" + assistant = "assistant" + tool = "tool" + + def __str__(self): + return self.name + + +class ChatCompletionStreamResponseDelta(BaseModel): + content: Optional[str] = Field( + None, description="The contents of the chunk message." + ) + function_call: Optional[FunctionCall2] = Field( + None, + description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", + ) + tool_calls: Optional[List[ChatCompletionMessageToolCallChunk]] = None + role: Optional[str] = Field( + None, description="The role of the author of this message." 
+ ) + + +class Model2(Enum): + gpt_4_turbo = "gpt-4-turbo" + gpt_4_turbo_2024_04_09 = "gpt-4-turbo-2024-04-09" + gpt_4_0125_preview = "gpt-4-0125-preview" + gpt_4_turbo_preview = "gpt-4-turbo-preview" + gpt_4_1106_preview = "gpt-4-1106-preview" + gpt_4_vision_preview = "gpt-4-vision-preview" + gpt_4 = "gpt-4" + gpt_4_0314 = "gpt-4-0314" + gpt_4_0613 = "gpt-4-0613" + gpt_4_32k = "gpt-4-32k" + gpt_4_32k_0314 = "gpt-4-32k-0314" + gpt_4_32k_0613 = "gpt-4-32k-0613" + gpt_3_5_turbo = "gpt-3.5-turbo" + gpt_3_5_turbo_16k = "gpt-3.5-turbo-16k" + gpt_3_5_turbo_0301 = "gpt-3.5-turbo-0301" + gpt_3_5_turbo_0613 = "gpt-3.5-turbo-0613" + gpt_3_5_turbo_1106 = "gpt-3.5-turbo-1106" + gpt_3_5_turbo_0125 = "gpt-3.5-turbo-0125" + gpt_3_5_turbo_16k_0613 = "gpt-3.5-turbo-16k-0613" + + +class Type6(Enum): + text = "text" + json_object = "json_object" + + +class ResponseFormat(BaseModel): + type: Optional[Type6] = Field( + "text", + description="Must be one of `text` or `json_object`.", + example="json_object", + ) + + +class FunctionCall3(Enum): + none = "none" + auto = "auto" + + +class ChatCompletionFinishReason(Enum): + stop = "stop" + length = "length" + tool_calls = "tool_calls" + content_filter = "content_filter" + function_call = "function_call" + + +class Object2(Enum): + chat_completion = "chat.completion" + + +class FinishReason2(Enum): + stop = "stop" + length = "length" + function_call = "function_call" + content_filter = "content_filter" + + +class TopLogprob(BaseModel): + token: str = Field(..., description="The token.") + logprob: float = Field( + ..., + description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", + ) + bytes: List[int] = Field( + ..., + description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be `null` if there is no bytes representation for the token.", + ) + + +class ChatCompletionTokenLogprob(BaseModel): + token: str = Field(..., description="The token.") + logprob: float = Field( + ..., + description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", + ) + bytes: List[int] = Field( + ..., + description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be `null` if there is no bytes representation for the token.", + ) + top_logprobs: List[TopLogprob] = Field( + ..., + description="List of the most likely tokens and their log probability, at this token position. 
In rare cases, there may be fewer than the number of requested `top_logprobs` returned.", + ) + + +class Logprobs2(BaseModel): + content: List[ChatCompletionTokenLogprob] = Field( + ..., + description="A list of message content tokens with log probability information.", + ) + + +class ChatCompletionFinishReason(Enum): + stop = "stop" + length = "length" + tool_calls = "tool_calls" + content_filter = "content_filter" + function_call = "function_call" + + +class ChatCompletionStreamingResponseChoice(BaseModel): + delta: ChatCompletionStreamResponseDelta + logprobs: Optional[Logprobs2] = Field( + None, description="Log probability information for the choice." + ) + finish_reason: ChatCompletionFinishReason | None = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + + +class Object4(Enum): + chat_completion_chunk = "chat.completion.chunk" + + +class CreateChatCompletionStreamResponse(BaseModel): + id: str = Field( + ..., + description="A unique identifier for the chat completion. Each chunk has the same ID.", + ) + choices: List[ChatCompletionStreamingResponseChoice] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created. Each chunk has the same timestamp.", + ) + model: str = Field(..., description="The model to generate the completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object4 = Field( + ..., description="The object type, which is always `chat.completion.chunk`." + ) + + +class CreateChatCompletionImageResponse(BaseModel): + pass + + +class Object5(Enum): + model = "model" + + +class Model(BaseModel): + id: str = Field( + ..., + description="The model identifier, which can be referenced in the API endpoints.", + ) + created: int = Field( + ..., description="The Unix timestamp (in seconds) when the model was created." + ) + object: Object5 = Field( + ..., description='The object type, which is always "model".' + ) + owned_by: str = Field(..., description="The organization that owns the model.") + + +class CompletionUsage(BaseModel): + completion_tokens: int = Field( + ..., description="Number of tokens in the generated completion." 
+ ) + prompt_tokens: int = Field(..., description="Number of tokens in the prompt.") + total_tokens: int = Field( + ..., + description="Total number of tokens used in the request (prompt + completion).", + ) + + +class Event(Enum): + error = "error" + + +class ErrorEvent(BaseModel): + event: Event + data: Error + + +class Event1(Enum): + done = "done" + + +class Data(Enum): + field_DONE_ = "[DONE]" + + +class DoneEvent(BaseModel): + event: Event1 + data: Data + + +class ListModelsResponse(BaseModel): + object: Object + data: List[Model] + + +class CreateCompletionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the completion.") + choices: List[Choice] = Field( + ..., + description="The list of completion choices the model generated for the input prompt.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the completion was created.", + ) + model: str = Field(..., description="The model used for completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object1 = Field( + ..., description='The object type, which is always "text_completion"' + ) + usage: Optional[CompletionUsage] = None + + +class ChatCompletionRequestMessageContentPart(RootModel): + root: Union[ + ChatCompletionRequestMessageContentPartText, + ChatCompletionRequestMessageContentPartImage, + ] + + +class ChatCompletionRequestUserMessage(BaseModel): + content: Union[str, List[ChatCompletionRequestMessageContentPart]] = Field( + ..., description="The contents of the user message.\n" + ) + role: Role1 = Field( + ..., description="The role of the messages author, in this case `user`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + + +class ChatCompletionTool(BaseModel): + type: Type2 = Field( + ..., + description="The type of the tool. Currently, only `function` is supported.", + ) + function: FunctionObject + + +class ChatCompletionToolChoiceOption(RootModel): + root: Union[ChatCompletionToolChoiceOption1, ChatCompletionNamedToolChoice] = Field( + ..., + description='Controls which (if any) tool is called by the model.\n`none` means the model will not call any tool and instead generates a message.\n`auto` means the model can pick between generating a message or calling one or more tools.\n`required` means the model must call one or more tools.\nSpecifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool.\n\n`none` is the default when no tools are present. `auto` is the default if tools are present.\n', + ) + + +class ChatCompletionMessageToolCalls(RootModel): + root: List[ChatCompletionMessageToolCall] = Field( + ..., + description="The tool calls generated by the model, such as function calls.", + ) + + +class ChatCompletionResponseMessage(BaseModel): + content: str = Field(..., description="The contents of the message.") + tool_calls: Optional[ChatCompletionMessageToolCalls] = None + role: str = Field(..., description="The role of the author of this message.") + function_call: Optional[FunctionCall] = Field( + None, + description="Deprecated and replaced by `tool_calls`. 
The name and arguments of a function that should be called, as generated by the model.", + ) + + +class ChatCompletionChoice(BaseModel): + finish_reason: ChatCompletionFinishReason = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + message: ChatCompletionResponseMessage + logprobs: Logprobs2 | None = Field( + ..., description="Log probability information for the choice." + ) + + +class CreateChatCompletionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the chat completion.") + choices: List[ChatCompletionChoice] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created.", + ) + model: str = Field(..., description="The model used for the chat completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object2 = Field( + ..., description="The object type, which is always `chat.completion`." + ) + usage: Optional[CompletionUsage] = None + + +class Choice2(BaseModel): + finish_reason: FinishReason2 = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, `content_filter` if content was omitted due to a flag from our content filters, or `function_call` if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + message: ChatCompletionResponseMessage + + +class CreateChatCompletionFunctionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the chat completion.") + choices: List[Choice2] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created.", + ) + model: str = Field(..., description="The model used for the chat completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object2 = Field( + ..., description="The object type, which is always `chat.completion`." + ) + usage: Optional[CompletionUsage] = None + + +class ChatCompletionRequestAssistantMessage(BaseModel): + content: Optional[str] = Field( + None, + description="The contents of the assistant message. 
Required unless `tool_calls` or `function_call` is specified.\n", + ) + role: Role2 = Field( + ..., description="The role of the messages author, in this case `assistant`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + tool_calls: Optional[ChatCompletionMessageToolCalls] = None + function_call: Optional[FunctionCall] = Field( + None, + description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", + ) + + +class ChatCompletionRequestMessage(RootModel): + root: Union[ + ChatCompletionRequestSystemMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionRequestToolMessage, + ChatCompletionRequestFunctionMessage, + ] + + @property + def role(self): + return self.root.role + + @property + def content(self): + return self.root.content + + +class CreateChatCompletionRequest(BaseModel): + messages: List[ChatCompletionRequestMessage] = Field( + ..., + description="A list of messages comprising the conversation so far. [Example Python code](https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models).", + min_items=1, + ) + model: Union[str, Model2] = Field( + ..., + description="ID of the model to use. See the [model endpoint compatibility](/docs/models/model-endpoint-compatibility) table for details on which models work with the Chat API.", + example="gpt-4-turbo", + ) + frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + logit_bias: Optional[Dict[str, int]] = Field( + None, + description="Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n", + ) + logprobs: Optional[bool] = Field( + False, + description="Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the `content` of `message`.", + ) + top_logprobs: Optional[conint(ge=0, le=20)] = Field( + None, + description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.", + ) + max_tokens: Optional[int] = Field( + 8168, + description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. 
[Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", + ) + n: Optional[conint(ge=1, le=128)] = Field( + 1, + description="How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.", + example=1, + ) + presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + response_format: Optional[ResponseFormat] = Field( + None, + description='An object specifying the format that the model must output. Compatible with [GPT-4 Turbo](/docs/models/gpt-4-and-gpt-4-turbo) and all GPT-3.5 Turbo models newer than `gpt-3.5-turbo-1106`.\n\nSetting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.\n\n**Important:** when using JSON mode, you **must** also instruct the model to produce JSON yourself via a system or user message. Without this, the model may generate an unending stream of whitespace until the generation reaches the token limit, resulting in a long-running and seemingly "stuck" request. Also note that the message content may be partially cut off if `finish_reason="length"`, which indicates the generation exceeded `max_tokens` or the conversation exceeded the max context length.\n', + ) + seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( + None, + description="This feature is in Beta.\nIf specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", + ) + stop: Optional[Union[str, List[str]]] = Field( + None, + description="Up to 4 sequences where the API will stop generating further tokens.\n", + ) + stream: Optional[bool] = Field( + False, + description="If set, partial message deltas will be sent, like in ChatGPT. Tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", + ) + temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( + 0.7, + description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", + example=1, + ) + top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( + 1, + description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. 
So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", + example=1, + ) + tools: Optional[List[ChatCompletionTool]] = Field( + None, + description="A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. A max of 128 functions are supported.\n", + ) + tool_choice: Optional[ChatCompletionToolChoiceOption] = None + user: Optional[str] = Field( + None, + description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", + example="user-1234", + ) + function_call: Optional[ + Union[FunctionCall3, ChatCompletionFunctionCallOption] + ] = Field( + None, + description='Deprecated in favor of `tool_choice`.\n\nControls which (if any) function is called by the model.\n`none` means the model will not call a function and instead generates a message.\n`auto` means the model can pick between generating a message or calling a function.\nSpecifying a particular function via `{"name": "my_function"}` forces the model to call that function.\n\n`none` is the default when no functions are present. `auto` is the default if functions are present.\n', + ) + functions: Optional[List[ChatCompletionFunctions]] = Field( + None, + description="Deprecated in favor of `tools`.\n\nA list of functions the model may generate JSON inputs for.\n", + max_items=128, + min_items=1, + ) + + +# Additional Aliases for Convenience + + +class ObjectType: + model = Object5.model + list = Object.list + text_completion = Object1.text_completion + chat_completion_chunk = Object4.chat_completion_chunk + chat_completion = Object2.chat_completion diff --git a/qa/L0_openai/openai_tritonserver/app/test_main.py b/qa/L0_openai/openai_tritonserver/app/test_main.py new file mode 100644 index 0000000000..cb17374351 --- /dev/null +++ b/qa/L0_openai/openai_tritonserver/app/test_main.py @@ -0,0 +1,27 @@ +import os +import tempfile + +from fastapi.testclient import TestClient + +from .main import app + +client = TestClient(app) + + +def test_health_success(): + # Context Manager to trigger app lifespan: + # https://fastapi.tiangolo.com/advanced/testing-events/ + with tempfile.TemporaryDirectory() as model_repository: + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + with TestClient(app) as client: + response = client.get("/health") + assert response.status_code == 200 + + +def test_health_fail(): + # Context Manager to trigger app lifespan: + # https://fastapi.tiangolo.com/advanced/testing-events/ + os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" + with TestClient(app) as client: + response = client.get("/health") + assert response.status_code == 400 diff --git a/qa/L0_openai/openai_tritonserver/app/transformers_utils/__init__.py b/qa/L0_openai/openai_tritonserver/app/transformers_utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py b/qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py new file mode 100644 index 0000000000..0011172c19 --- /dev/null +++ b/qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py @@ -0,0 +1,150 @@ +import os +from typing import Optional, Union + +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + +# from vllm.config 
import VLLM_USE_MODELSCOPE +# from vllm.logger import init_logger +# from vllm.lora.request import LoRARequest +# from vllm.transformers_utils.tokenizers import BaichuanTokenizer +# from vllm.utils import make_async + +# logger = init_logger(__name__) + +VLLM_USE_MODELSCOPE = False + + +def get_cached_tokenizer( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Get tokenizer with cached properties. + + This will patch the tokenizer object in place. + + By default, transformers will recompute multiple tokenizer properties + each time they are called, leading to a significant slowdown. This + function caches these properties for faster access.""" + + tokenizer_all_special_ids = set(tokenizer.all_special_ids) + tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended + tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + tokenizer_len = len(tokenizer) + + class CachedTokenizer(tokenizer.__class__): # type: ignore + @property + def all_special_ids(self): + return tokenizer_all_special_ids + + @property + def all_special_tokens(self): + return tokenizer_all_special_tokens + + @property + def all_special_tokens_extended(self): + return tokenizer_all_special_tokens_extended + + def __len__(self): + return tokenizer_len + + CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" + + tokenizer.__class__ = CachedTokenizer + return tokenizer + + +def get_tokenizer( + tokenizer_name: str, + *args, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + tokenizer_revision: Optional[str] = None, + download_dir: Optional[str] = None, + **kwargs, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Gets a tokenizer for the given model name via Huggingface/modelscope.""" + if VLLM_USE_MODELSCOPE: + # download model from ModelScope hub, + # lazy import so that modelscope is not required for normal use. + # pylint: disable=C. + from modelscope.hub.snapshot_download import snapshot_download + + # Only set the tokenizer here, model will be downloaded on the workers. + if not os.path.exists(tokenizer_name): + tokenizer_path = snapshot_download( + model_id=tokenizer_name, + cache_dir=download_dir, + revision=tokenizer_revision, + # Ignore weights - we only need the tokenizer. + ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"], + ) + tokenizer_name = tokenizer_path + + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + + try: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + *args, + trust_remote_code=trust_remote_code, + tokenizer_revision=tokenizer_revision, + **kwargs, + ) + except ValueError as e: + # If the error pertains to the tokenizer class not existing or not + # currently being imported, suggest using the --trust-remote-code flag. + if not trust_remote_code and ( + "does not exist or is not currently imported." in str(e) + or "requires you to execute the tokenizer file" in str(e) + ): + err_msg = ( + "Failed to load the tokenizer. If the tokenizer is a custom " + "tokenizer not yet available in the HuggingFace transformers " + "library, consider setting `trust_remote_code=True` in LLM " + "or using the `--trust-remote-code` flag in the CLI." 
+ ) + raise RuntimeError(err_msg) from e + else: + raise e + except AttributeError as e: + # if "BaichuanTokenizer" in str(e): + # # This is for the error "'BaichuanTokenizer' object has no + # # attribute 'sp_model'". + # tokenizer = BaichuanTokenizer.from_pretrained( + # tokenizer_name, + # *args, + # trust_remote_code=trust_remote_code, + # tokenizer_revision=tokenizer_revision, + # **kwargs) + # else: + raise e + + if not isinstance(tokenizer, PreTrainedTokenizerFast): + print( + "Using a slow tokenizer. This might cause a significant " + "slowdown. Consider using a fast tokenizer instead." + ) + return get_cached_tokenizer(tokenizer) + + +# def get_lora_tokenizer(lora_request: LoRARequest, *args, +# **kwargs) -> Optional[PreTrainedTokenizer]: +# if lora_request is None: +# return None +# try: +# tokenizer = get_tokenizer(lora_request.lora_local_path, *args, +# **kwargs) +# except OSError as e: +# # No tokenizer was found in the LoRA folder, +# # use base model tokenizer +# logger.warning( +# f"No tokenizer found in {lora_request.lora_local_path}, " +# "using base model tokenizer instead. " +# f"(Exception: {str(e)})") +# tokenizer = None +# return tokenizer + + +# get_lora_tokenizer_async = make_async(get_lora_tokenizer) From e14128bbe0af1221a5c3974581b99e4503955d14 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 1 Aug 2024 17:40:09 -0700 Subject: [PATCH 02/80] Restructure to recommended FastAPI project structure, add simple testing for utilities and models routes --- .../Dockerfile | 0 qa/L0_openai/example/main.py | 50 ++ .../app => example/src}/__init__.py | 0 qa/L0_openai/example/src/api_server.py | 53 ++ .../src/routers}/__init__.py | 0 .../example/src/routers/chat_completions.py | 131 +++++ .../example/src/routers/completions.py | 95 ++++ qa/L0_openai/example/src/routers/models.py | 56 ++ qa/L0_openai/example/src/routers/utilities.py | 24 + qa/L0_openai/example/src/schemas/__init__.py | 0 .../src/schemas/openai.py} | 0 qa/L0_openai/example/src/tests/__init__.py | 0 .../src/tests/test_chat_completions.py | 0 .../example/src/tests/test_completions.py | 0 .../example/src/tests/test_utilities.py | 50 ++ qa/L0_openai/example/src/utils/__init__.py | 0 .../src/utils}/tokenizer.py | 0 qa/L0_openai/example/src/utils/triton.py | 120 ++++ qa/L0_openai/openai_tritonserver/app/main.py | 516 ------------------ .../openai_tritonserver/app/test_main.py | 27 - 20 files changed, 579 insertions(+), 543 deletions(-) rename qa/L0_openai/{openai_tritonserver => example}/Dockerfile (100%) create mode 100644 qa/L0_openai/example/main.py rename qa/L0_openai/{openai_tritonserver/app => example/src}/__init__.py (100%) create mode 100644 qa/L0_openai/example/src/api_server.py rename qa/L0_openai/{openai_tritonserver/app/transformers_utils => example/src/routers}/__init__.py (100%) create mode 100644 qa/L0_openai/example/src/routers/chat_completions.py create mode 100644 qa/L0_openai/example/src/routers/completions.py create mode 100644 qa/L0_openai/example/src/routers/models.py create mode 100644 qa/L0_openai/example/src/routers/utilities.py create mode 100644 qa/L0_openai/example/src/schemas/__init__.py rename qa/L0_openai/{openai_tritonserver/app/openai_protocol_types.py => example/src/schemas/openai.py} (100%) create mode 100644 qa/L0_openai/example/src/tests/__init__.py create mode 100644 qa/L0_openai/example/src/tests/test_chat_completions.py create mode 100644 qa/L0_openai/example/src/tests/test_completions.py create mode 100644 qa/L0_openai/example/src/tests/test_utilities.py create mode 
100644 qa/L0_openai/example/src/utils/__init__.py rename qa/L0_openai/{openai_tritonserver/app/transformers_utils => example/src/utils}/tokenizer.py (100%) create mode 100644 qa/L0_openai/example/src/utils/triton.py delete mode 100644 qa/L0_openai/openai_tritonserver/app/main.py delete mode 100644 qa/L0_openai/openai_tritonserver/app/test_main.py diff --git a/qa/L0_openai/openai_tritonserver/Dockerfile b/qa/L0_openai/example/Dockerfile similarity index 100% rename from qa/L0_openai/openai_tritonserver/Dockerfile rename to qa/L0_openai/example/Dockerfile diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/example/main.py new file mode 100644 index 0000000000..75a9528e4c --- /dev/null +++ b/qa/L0_openai/example/main.py @@ -0,0 +1,50 @@ +import argparse + +import uvicorn +from src.api_server import app + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Triton OpenAI Compatible RESTful API server." + ) + parser.add_argument("--host", type=str, default=None, help="host name") + parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=["debug", "info", "warning", "error", "critical", "trace"], + help="log level for uvicorn", + ) + parser.add_argument( + "--response-role", type=str, default="assistant", help="The role name to return" + ) + + parser.add_argument( + "--tritonserver-log-level", + type=int, + default=0, + help="The tritonserver log level", + ) + + parser.add_argument( + "--model-repository", + type=str, + default="/workspace/llm-models", + help="model repository", + ) + return parser.parse_args() + + +if __name__ == "__main__": + # TODO: Cleanup + args = parse_args() + + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + timeout_keep_alive=5, + ) diff --git a/qa/L0_openai/openai_tritonserver/app/__init__.py b/qa/L0_openai/example/src/__init__.py similarity index 100% rename from qa/L0_openai/openai_tritonserver/app/__init__.py rename to qa/L0_openai/example/src/__init__.py diff --git a/qa/L0_openai/example/src/api_server.py b/qa/L0_openai/example/src/api_server.py new file mode 100644 index 0000000000..37f49ab5b7 --- /dev/null +++ b/qa/L0_openai/example/src/api_server.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from contextlib import asynccontextmanager +from typing import Union + +import tritonserver +from fastapi import FastAPI +from src.routers import chat_completions, completions, models, utilities +from src.utils.triton import init_tritonserver +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + + +@asynccontextmanager +async def lifespan(app: FastAPI): + print("Starting FastAPI app lifespan...") + # Start the tritonserver on FastAPI app startup + app.server = init_tritonserver() + + yield + + # Cleanup the tritonserver on FastAPI app shutdown + print("Shutting down FastAPI app lifespan...") + if app.server: + print("Shutting down Triton Inference Server...") + app.server.stop() + + +app = FastAPI( + title="OpenAI API", + description="The OpenAI REST API. 
Please see https://platform.openai.com/docs/api-reference for more details.", + version="2.0.0", + termsOfService="https://openai.com/policies/terms-of-use", + contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, + license={ + "name": "MIT", + "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, + servers=[{"url": "https://api.openai.com/v1"}], + lifespan=lifespan, +) + +app.include_router(utilities.router) +app.include_router(models.router) +app.include_router(completions.router) +app.include_router(chat_completions.router) + +server: tritonserver.Server +model: tritonserver.Model +model_source_name: str +model_create_time: int +backend: str +tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] +create_inference_request = None diff --git a/qa/L0_openai/openai_tritonserver/app/transformers_utils/__init__.py b/qa/L0_openai/example/src/routers/__init__.py similarity index 100% rename from qa/L0_openai/openai_tritonserver/app/transformers_utils/__init__.py rename to qa/L0_openai/example/src/routers/__init__.py diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/example/src/routers/chat_completions.py new file mode 100644 index 0000000000..4be4bcdf57 --- /dev/null +++ b/qa/L0_openai/example/src/routers/chat_completions.py @@ -0,0 +1,131 @@ +from fastapi import APIRouter +from fastapi.responses import StreamingResponse +from src.schemas.openai import ( + ChatCompletionChoice, + ChatCompletionFinishReason, + ChatCompletionResponseMessage, + ChatCompletionStreamingResponseChoice, + ChatCompletionStreamResponseDelta, + CreateChatCompletionRequest, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, + ObjectType, +) +from src.utils.triton import get_output + +router = APIRouter() + + +def streaming_chat_completion_response(request_id, created, model, role, responses): + # first chunk + + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=role, content=None, function_call=None + ), + logprobs=None, + finish_reason=None, + ) + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + for response in responses: + text = get_output(response) + + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=None, content=text, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop if response.final else None, + ) + + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + yield "data: [DONE]\n\n" + + +@router.post( + "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] +) +def create_chat_completion( + request: CreateChatCompletionRequest, +) -> CreateChatCompletionResponse | StreamingResponse: + """ + Creates a model response for the given chat conversation. 
+ """ + + if not model or not tokenizer or not create_inference_request: + raise Exception("Unknown Model") + + add_generation_prompt_default = True + default_role = "assistant" + + if request.model != model.name and request.model != model_source_name: + raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + + if request.n and request.n > 1: + raise HTTPException(status_code=400, detail=f"Only single choice is supported") + + conversation = [ + {"role": str(message.role), "content": str(message.content)} + for message in request.messages + ] + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=add_generation_prompt_default, + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + responses = model.infer(create_inference_request(model, prompt, request)) + + if request.stream: + return StreamingResponse( + streaming_chat_completion_response( + request_id, created, request.model, conversation[-1]["role"], responses + ) + ) + + response = list(responses)[0] + + text = get_output(response) + + return CreateChatCompletionResponse( + id=request_id, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionResponseMessage( + content=text, role=default_role, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop, + ) + ], + created=created, + model=request.model, + system_fingerprint=None, + object=ObjectType.chat_completion, + ) diff --git a/qa/L0_openai/example/src/routers/completions.py b/qa/L0_openai/example/src/routers/completions.py new file mode 100644 index 0000000000..2a4ae22a48 --- /dev/null +++ b/qa/L0_openai/example/src/routers/completions.py @@ -0,0 +1,95 @@ +from fastapi import APIRouter, Request +from fastapi.responses import StreamingResponse +from src.schemas.openai import ( + Choice, + CreateCompletionRequest, + CreateCompletionResponse, + FinishReason, + ObjectType, +) +from src.utils.triton import get_output + +router = APIRouter() + + +def streaming_completion_response(request_id, created, model, responses): + for response in responses: + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop if response.final else None, + index=0, + logprobs=None, + text=text, + ) + response = CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model, + ) + + yield f"data: {response.json(exclude_unset=True)}\n\n" + yield "data: [DONE]\n\n" + + +@router.post( + "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] +) +def create_completion( + request: CreateCompletionRequest, raw_request: Request +) -> CreateCompletionResponse | StreamingResponse: + """ + Creates a completion for the provided prompt and parameters. 
+ """ + + if not model or not tokenizer or not create_inference_request: + raise Exception("Unknown Model") + + if request.suffix is not None: + raise HTTPException(status_code=400, detail="suffix is not currently supported") + + if request.model != model.name and request.model != model_source_name: + raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + + if request.prompt is None: + request.prompt = "<|endoftext|>" + + # Currently only support single string as input + if not isinstance(request.prompt, str): + raise HTTPException( + status_code=400, detail="only single string input is supported" + ) + + if request.logit_bias is not None or request.logprobs is not None: + raise HTTPException( + status_code=400, detail="logit bias and log probs not supported" + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + responses = model.infer(create_inference_request(model, request.prompt, request)) + if request.stream: + return StreamingResponse( + streaming_completion_response(request_id, created, model.name, responses) + ) + response = list(responses)[0] + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop if response.final else None, + index=0, + logprobs=None, + text=text, + ) + return CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model.name, + ) diff --git a/qa/L0_openai/example/src/routers/models.py b/qa/L0_openai/example/src/routers/models.py new file mode 100644 index 0000000000..297ded8d50 --- /dev/null +++ b/qa/L0_openai/example/src/routers/models.py @@ -0,0 +1,56 @@ +from fastapi import APIRouter +from src.schemas.openai import ListModelsResponse, Model, ObjectType + +router = APIRouter() + +# TODO: What is this for? +OWNED_BY = "ACME" + + +@router.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) +def list_models() -> ListModelsResponse: + """ + Lists the currently available models, and provides basic information about each one such as the owner and availability. + """ + + model_list = [ + Model( + id=model.name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + Model( + id=model_source_name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + ] + + return ListModelsResponse(object=ObjectType.list, data=model_list) + + +@router.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) +def retrieve_model(model_name: str) -> Model: + """ + Retrieves a model instance, providing basic information about the model such as the owner and permissioning. 
+ """ + + if model_name == model.name: + return Model( + id=model.name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ) + + if model_name == model_source_name: + return Model( + id=model_source_name, + created=model_create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ) + + raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") diff --git a/qa/L0_openai/example/src/routers/utilities.py b/qa/L0_openai/example/src/routers/utilities.py new file mode 100644 index 0000000000..98d506dab5 --- /dev/null +++ b/qa/L0_openai/example/src/routers/utilities.py @@ -0,0 +1,24 @@ +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import Response + +router = APIRouter() + + +@router.get("/metrics", tags=["Utilities"]) +def metrics(request: Request) -> str: + if not request.app.server or not request.app.server.live(): + raise HTTPException( + status_code=400, detail="Triton Inference Server is not live." + ) + + return request.app.server.metrics() + + +@router.get("/health", tags=["Utilities"]) +def health(request: Request) -> Response: + if not request.app.server or not request.app.server.live(): + raise HTTPException( + status_code=400, detail="Triton Inference Server is not live." + ) + + return Response(status_code=200) diff --git a/qa/L0_openai/example/src/schemas/__init__.py b/qa/L0_openai/example/src/schemas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py b/qa/L0_openai/example/src/schemas/openai.py similarity index 100% rename from qa/L0_openai/openai_tritonserver/app/openai_protocol_types.py rename to qa/L0_openai/example/src/schemas/openai.py diff --git a/qa/L0_openai/example/src/tests/__init__.py b/qa/L0_openai/example/src/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/example/src/tests/test_utilities.py b/qa/L0_openai/example/src/tests/test_utilities.py new file mode 100644 index 0000000000..1bf74dda01 --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_utilities.py @@ -0,0 +1,50 @@ +import os +import tempfile + +import pytest +from fastapi.testclient import TestClient +from src.api_server import app + +client = TestClient(app) + + +# TODO: Use fixture for less verbose model repo prep +# @pytest.fixture(scope="session") +# def setup_model_repository(): +# pass + + +def test_not_found(): + with TestClient(app) as client: + response = client.get("/does-not-exist") + assert response.status_code == 404 + + +def test_startup_metrics(): + with tempfile.TemporaryDirectory() as model_repository: + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + with TestClient(app) as client: + response = client.get("/metrics") + assert response.status_code == 200 + # FIXME: Flesh out more + # NOTE: response.json() works even on non-json prometheus data? 
+ assert "nv_cpu_utilization" in response.json() + # No models loaded, no per-model metrics + assert "nv_inference_count" not in response.json() + + +def test_startup_success(): + with tempfile.TemporaryDirectory() as model_repository: + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + with TestClient(app) as client: + response = client.get("/health") + assert response.status_code == 200 + + +def test_startup_fail(): + os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" + with pytest.raises(Exception): + # Test that FastAPI lifespan startup fails when initializing Triton + # with unknown model repository. + with TestClient(app) as client: + pass diff --git a/qa/L0_openai/example/src/utils/__init__.py b/qa/L0_openai/example/src/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py b/qa/L0_openai/example/src/utils/tokenizer.py similarity index 100% rename from qa/L0_openai/openai_tritonserver/app/transformers_utils/tokenizer.py rename to qa/L0_openai/example/src/utils/tokenizer.py diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py new file mode 100644 index 0000000000..9e749ba182 --- /dev/null +++ b/qa/L0_openai/example/src/utils/triton.py @@ -0,0 +1,120 @@ +import os + +import numpy as np +import tritonserver +from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest +from src.utils.tokenizer import get_tokenizer + +# TODO: Remove +SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm"} +KNOWN_MODELS = {"gpt2": "hf:gpt2"} + + +# TODO: Re-organize helpers +def load_model(server): + model = None + backends = [] + tokenizer = None + model_source_name = None + for model_name, version in server.models().keys(): + if version != -1: + continue + current_model = server.load(model_name) + backends.append(current_model.config()["backend"]) + if model_name in KNOWN_MODELS.keys(): + model = current_model + model_source_name = KNOWN_MODELS[model_name].replace("hf:", "") + tokenizer = get_tokenizer(model_source_name) + if model and tokenizer: + for backend in backends: + if backend in SUPPORTED_BACKENDS: + return model, int(time.time()), backend, tokenizer, model_source_name + return None, None, None, None, None + + +def init_tritonserver(): + # TODO: How to pass arguments to server here? 
+ model_repository = os.environ.get( + "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" + ) + + print("Starting Triton Server Core...") + server = tritonserver.Server( + model_repository=model_repository, + log_verbose=1, + log_info=True, + log_warn=True, + log_error=True, + model_control_mode=tritonserver.ModelControlMode.EXPLICIT, + ).start(wait_until_ready=True) + + # TODO: Cleanup + # print("Loading Model...\n\n") + + # model, model_create_time, backend, tokenizer, model_source_name = load_model(server) + + # if not (model and backend and tokenizer and model_create_time): + # raise Exception("Unknown Model") + + # print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") + + # if backend == "vllm": + # create_inference_request = create_vllm_inference_request + # elif backend == "tensorrtllm": + # create_inference_request = create_trtllm_inference_request + + return server + + +def get_output(response): + if "text_output" in response.outputs: + try: + return response.outputs["text_output"].to_string_array()[0] + except: + return str(response.outputs["text_output"].to_bytes_array()[0]) + return None + + +def create_vllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + sampling_parameters = request.copy( + exclude={"model", "stream", "messages", "prompt", "echo"}, + ).model_dump(exclude_none=True) + inputs["text_input"] = [prompt] + inputs["stream"] = [request.stream] + exclude_input_in_output = True + echo = getattr(request, "echo", None) + if echo: + exclude_input_in_output = not echo + inputs["exclude_input_in_output"] = [exclude_input_in_output] + return model.create_request(inputs=inputs, parameters=sampling_parameters) + + +def create_trtllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + if model.name == "llama-3-8b-instruct": + inputs["stop_words"] = [["<|eot_id|>", "<|end_of_text|>"]] + inputs["text_input"] = [[prompt]] + inputs["stream"] = [[request.stream]] + if request.max_tokens: + inputs["max_tokens"] = np.int32([[request.max_tokens]]) + if request.stop: + if isinstance(request.stop, str): + request.stop = [request.stop] + inputs["stop_words"] = [request.stop] + if request.top_p: + inputs["top_p"] = np.float32([[request.top_p]]) + if request.frequency_penalty: + inputs["frequency_penalty"] = np.float32([[request.frequency_penalty]]) + if request.presence_penalty: + inputs["presence_penalty":] = np.int32([[request.presence_penalty]]) + if request.seed: + inputs["random_seed"] = np.uint64([[request.seed]]) + if request.temperature: + inputs["temperature"] = np.float32([[request.temperature]]) + + return model.create_request(inputs=inputs) diff --git a/qa/L0_openai/openai_tritonserver/app/main.py b/qa/L0_openai/openai_tritonserver/app/main.py deleted file mode 100644 index 3881564480..0000000000 --- a/qa/L0_openai/openai_tritonserver/app/main.py +++ /dev/null @@ -1,516 +0,0 @@ -# generated by fastapi-codegen: -# filename: openai_trimmed.yml -# timestamp: 2024-05-05T21:52:36+00:00 - -from __future__ import annotations - -import argparse -import os -import time -import uuid -from contextlib import asynccontextmanager -from typing import Optional, Union - -import numpy -import tritonserver -import uvicorn -from fastapi import FastAPI, HTTPException, Request -from fastapi.responses import Response, StreamingResponse - -# TODO: transformer utils needed? 
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - -from .openai_protocol_types import ( - ChatCompletionChoice, - ChatCompletionFinishReason, - ChatCompletionResponseMessage, - ChatCompletionStreamingResponseChoice, - ChatCompletionStreamResponseDelta, - Choice, - CreateChatCompletionRequest, - CreateChatCompletionResponse, - CreateChatCompletionStreamResponse, - CreateCompletionRequest, - CreateCompletionResponse, - FinishReason, - ListModelsResponse, - Model, - ObjectType, -) -from .transformers_utils.tokenizer import get_tokenizer - -# TODO: Remove -SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm"} -KNOWN_MODELS = {"gpt2": "hf:gpt2"} - -# TODO: What is this for? -OWNED_BY = "ACME" -TIMEOUT_KEEP_ALIVE = 5 # seconds - -server: tritonserver.Server -model: tritonserver.Model -model_source_name: str -model_create_time: int -backend: str -tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -create_inference_request = None - - -# TODO: Re-organize helpers -def load_model(server): - model = None - backends = [] - tokenizer = None - model_source_name = None - for model_name, version in server.models().keys(): - if version != -1: - continue - current_model = server.load(model_name) - backends.append(current_model.config()["backend"]) - if model_name in KNOWN_MODELS.keys(): - model = current_model - model_source_name = KNOWN_MODELS[model_name].replace("hf:", "") - tokenizer = get_tokenizer(model_source_name) - if model and tokenizer: - for backend in backends: - if backend in SUPPORTED_BACKENDS: - return model, int(time.time()), backend, tokenizer, model_source_name - return None, None, None, None, None - - -def init_tritonserver(): - # TODO: How to pass arguments to server here? - model_repository = os.environ.get( - "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" - ) - - print("Starting Triton Server Core...") - server = tritonserver.Server( - model_repository=model_repository, - log_verbose=1, - log_info=True, - log_warn=True, - log_error=True, - model_control_mode=tritonserver.ModelControlMode.EXPLICIT, - ).start(wait_until_ready=True) - - # TODO: Cleanup - # print("Loading Model...\n\n") - - # model, model_create_time, backend, tokenizer, model_source_name = load_model(server) - - # if not (model and backend and tokenizer and model_create_time): - # raise Exception("Unknown Model") - - # print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") - - # if backend == "vllm": - # create_inference_request = create_vllm_inference_request - # elif backend == "tensorrtllm": - # create_inference_request = create_trtllm_inference_request - - return server - - -@asynccontextmanager -async def lifespan(app: FastAPI): - print("Starting FastAPI app lifespan...") - # Start the tritonserver on FastAPI app startup - try: - print("Starting Triton Inference Server...") - app.server = init_tritonserver() - except Exception as e: - print(f"Failed to start Triton Inference Server: {e}") - app.server = None - - yield - - # Cleanup the tritonserver on FastAPI app shutdown - print("Shutting down FastAPI app lifespan...") - if app.server: - print("Shutting down Triton Inference Server...") - app.server.stop() - - -app = FastAPI( - title="OpenAI API", - description="The OpenAI REST API. 
Please see https://platform.openai.com/docs/api-reference for more details.", - version="2.0.0", - termsOfService="https://openai.com/policies/terms-of-use", - contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, - license={ - "name": "MIT", - "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", - }, - servers=[{"url": "https://api.openai.com/v1"}], - lifespan=lifespan, -) - - -# TODO: use router? -@app.get("/health") -def health() -> Response: - if not app.server: - raise HTTPException( - status_code=400, - detail="Triton Inference Server failed to start successfully.", - ) - - if not app.server.live(): - raise HTTPException( - status_code=400, detail="Triton Inference Server is not live." - ) - - return Response(status_code=200) - - -def get_output(response): - if "text_output" in response.outputs: - try: - return response.outputs["text_output"].to_string_array()[0] - except: - return str(response.outputs["text_output"].to_bytes_array()[0]) - return None - - -def streaming_chat_completion_response(request_id, created, model, role, responses): - # first chunk - - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=role, content=None, function_call=None - ), - logprobs=None, - finish_reason=None, - ) - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - for response in responses: - text = get_output(response) - - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=None, content=text, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop if response.final else None, - ) - - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - yield "data: [DONE]\n\n" - - -def create_vllm_inference_request( - model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest -): - inputs = {} - sampling_parameters = request.copy( - exclude={"model", "stream", "messages", "prompt", "echo"}, - ).model_dump(exclude_none=True) - inputs["text_input"] = [prompt] - inputs["stream"] = [request.stream] - exclude_input_in_output = True - echo = getattr(request, "echo", None) - if echo: - exclude_input_in_output = not echo - inputs["exclude_input_in_output"] = [exclude_input_in_output] - return model.create_request(inputs=inputs, parameters=sampling_parameters) - - -def create_trtllm_inference_request( - model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest -): - inputs = {} - if model.name == "llama-3-8b-instruct": - inputs["stop_words"] = [["<|eot_id|>", "<|end_of_text|>"]] - inputs["text_input"] = [[prompt]] - inputs["stream"] = [[request.stream]] - if request.max_tokens: - inputs["max_tokens"] = numpy.int32([[request.max_tokens]]) - if request.stop: - if isinstance(request.stop, str): - request.stop = [request.stop] - inputs["stop_words"] = [request.stop] - if request.top_p: - inputs["top_p"] = numpy.float32([[request.top_p]]) - if request.frequency_penalty: - inputs["frequency_penalty"] = numpy.float32([[request.frequency_penalty]]) - if request.presence_penalty: - inputs["presence_penalty":] = 
numpy.int32([[request.presence_penalty]]) - if request.seed: - inputs["random_seed"] = numpy.uint64([[request.seed]]) - if request.temperature: - inputs["temperature"] = numpy.float32([[request.temperature]]) - - return model.create_request(inputs=inputs) - - -@app.post( - "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] -) -def create_chat_completion( - request: CreateChatCompletionRequest, -) -> CreateChatCompletionResponse | StreamingResponse: - """ - Creates a model response for the given chat conversation. - """ - - if not model or not tokenizer or not create_inference_request: - raise Exception("Unknown Model") - - add_generation_prompt_default = True - default_role = "assistant" - - if request.model != model.name and request.model != model_source_name: - raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") - - if request.n and request.n > 1: - raise HTTPException(status_code=400, detail=f"Only single choice is supported") - - conversation = [ - {"role": str(message.role), "content": str(message.content)} - for message in request.messages - ] - - prompt = tokenizer.apply_chat_template( - conversation=conversation, - tokenize=False, - add_generation_prompt=add_generation_prompt_default, - ) - - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - - responses = model.infer(create_inference_request(model, prompt, request)) - - if request.stream: - return StreamingResponse( - streaming_chat_completion_response( - request_id, created, request.model, conversation[-1]["role"], responses - ) - ) - - response = list(responses)[0] - - text = get_output(response) - - return CreateChatCompletionResponse( - id=request_id, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionResponseMessage( - content=text, role=default_role, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop, - ) - ], - created=created, - model=request.model, - system_fingerprint=None, - object=ObjectType.chat_completion, - ) - - -def streaming_completion_response(request_id, created, model, responses): - for response in responses: - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop if response.final else None, - index=0, - logprobs=None, - text=text, - ) - response = CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=model, - ) - - yield f"data: {response.json(exclude_unset=True)}\n\n" - yield "data: [DONE]\n\n" - - -@app.post( - "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] -) -def create_completion( - request: CreateCompletionRequest, raw_request: Request -) -> CreateCompletionResponse | StreamingResponse: - """ - Creates a completion for the provided prompt and parameters. 
- """ - - if not model or not tokenizer or not create_inference_request: - raise Exception("Unknown Model") - - if request.suffix is not None: - raise HTTPException(status_code=400, detail="suffix is not currently supported") - - if request.model != model.name and request.model != model_source_name: - raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") - - if request.prompt is None: - request.prompt = "<|endoftext|>" - - # Currently only support single string as input - if not isinstance(request.prompt, str): - raise HTTPException( - status_code=400, detail="only single string input is supported" - ) - - if request.logit_bias is not None or request.logprobs is not None: - raise HTTPException( - status_code=400, detail="logit bias and log probs not supported" - ) - - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - - responses = model.infer(create_inference_request(model, request.prompt, request)) - if request.stream: - return StreamingResponse( - streaming_completion_response(request_id, created, model.name, responses) - ) - response = list(responses)[0] - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop if response.final else None, - index=0, - logprobs=None, - text=text, - ) - return CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=model.name, - ) - - -@app.get("/metrics") -def metrics() -> str: - return server.metrics() - - -@app.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) -def list_models() -> ListModelsResponse: - """ - Lists the currently available models, and provides basic information about each one such as the owner and availability. - """ - - model_list = [ - Model( - id=model.name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - Model( - id=model_source_name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - ] - - return ListModelsResponse(object=ObjectType.list, data=model_list) - - -@app.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) -def retrieve_model(model_name: str) -> Model: - """ - Retrieves a model instance, providing basic information about the model such as the owner and permissioning. - """ - - if model_name == model.name: - return Model( - id=model.name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ) - - if model_name == model_source_name: - return Model( - id=model_source_name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ) - - raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Triton OpenAI Compatible RESTful API server." 
- ) - parser.add_argument("--host", type=str, default=None, help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") - parser.add_argument( - "--uvicorn-log-level", - type=str, - default="info", - choices=["debug", "info", "warning", "error", "critical", "trace"], - help="log level for uvicorn", - ) - parser.add_argument( - "--response-role", type=str, default="assistant", help="The role name to return" - ) - - parser.add_argument( - "--tritonserver-log-level", - type=int, - default=0, - help="The tritonserver log level", - ) - - parser.add_argument( - "--model-repository", - type=str, - default="/workspace/llm-models", - help="model repository", - ) - return parser.parse_args() - - -if __name__ == "__main__": - # TODO: Cleanup - args = parse_args() - - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ) diff --git a/qa/L0_openai/openai_tritonserver/app/test_main.py b/qa/L0_openai/openai_tritonserver/app/test_main.py deleted file mode 100644 index cb17374351..0000000000 --- a/qa/L0_openai/openai_tritonserver/app/test_main.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -import tempfile - -from fastapi.testclient import TestClient - -from .main import app - -client = TestClient(app) - - -def test_health_success(): - # Context Manager to trigger app lifespan: - # https://fastapi.tiangolo.com/advanced/testing-events/ - with tempfile.TemporaryDirectory() as model_repository: - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - with TestClient(app) as client: - response = client.get("/health") - assert response.status_code == 200 - - -def test_health_fail(): - # Context Manager to trigger app lifespan: - # https://fastapi.tiangolo.com/advanced/testing-events/ - os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" - with TestClient(app) as client: - response = client.get("/health") - assert response.status_code == 400 From a37b0b3e42d47d15205b459d183a767ffdda6fef Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 1 Aug 2024 17:45:04 -0700 Subject: [PATCH 03/80] Start a CONTRIBUTING.md --- qa/L0_openai/CONTRIBUTING.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 qa/L0_openai/CONTRIBUTING.md diff --git a/qa/L0_openai/CONTRIBUTING.md b/qa/L0_openai/CONTRIBUTING.md new file mode 100644 index 0000000000..8e758a7956 --- /dev/null +++ b/qa/L0_openai/CONTRIBUTING.md @@ -0,0 +1,34 @@ +# Triton Inference Server OpenAI Example + +## Development + +For simplicity, a `Dockerfile` containing the necessary +dependencies is included, which can be modified and built +for your needs. + +``` +docker build -t fastapi_triton . +# TODO: minimal args +docker run ... fastapi_triton +# TODO: cd to location +fastapi dev +``` + +## Testing + +The testing for this example is all done through `pytest`, which +is well integrated with `FastAPI`. + +``` +cd src/tests +pytest +``` + +## Adding New Routes + +First define your own router in `src/routers`, referring +to the existing routers as examples. + +Then, add your router to the application in `api_server.py` +with `app.include_router(my_router)`. 
+ From 7eb1ffc75e1fb0c455ff3d5e632a8d14738ed6a9 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 2 Aug 2024 18:50:02 -0700 Subject: [PATCH 04/80] Add simple /completions endpoint test --- qa/L0_openai/example/README.md | 15 ++++++ qa/L0_openai/example/main.py | 15 +++--- .../example/src/routers/chat_completions.py | 2 + .../example/src/routers/completions.py | 30 ++++++++--- qa/L0_openai/example/src/schemas/openai.py | 41 ++++++++------- .../src/tests/test_chat_completions.py | 47 +++++++++++++++++ .../example/src/tests/test_completions.py | 52 +++++++++++++++++++ .../src/tests/test_models/gpt2/1/model.json | 1 + .../src/tests/test_models/gpt2/config.pbtxt | 2 + .../example/src/tests/test_utilities.py | 3 -- qa/L0_openai/example/src/utils/triton.py | 24 +++++---- 11 files changed, 185 insertions(+), 47 deletions(-) create mode 100644 qa/L0_openai/example/README.md create mode 100644 qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json create mode 100644 qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt diff --git a/qa/L0_openai/example/README.md b/qa/L0_openai/example/README.md new file mode 100644 index 0000000000..948889c916 --- /dev/null +++ b/qa/L0_openai/example/README.md @@ -0,0 +1,15 @@ +Goal: + +``` +docker build -t tritonserver-openai:latest . +docker run -it --net=host --gpus all --rm \ + tritonserver-openai:latest \ + --model gpt2 +``` + +Testing: +- Verify known issues are fixed or not + - concurrency, parameter corruption, etc. + - check out Tanmay's fix for using numpy arrays instead of native types + - exclude_input_in_output overwritten at high concurrency? + - ? diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/example/main.py index 75a9528e4c..fb7c1e61db 100644 --- a/qa/L0_openai/example/main.py +++ b/qa/L0_openai/example/main.py @@ -1,4 +1,5 @@ import argparse +import os import uvicorn from src.api_server import app @@ -8,6 +9,7 @@ def parse_args(): parser = argparse.ArgumentParser( description="Triton OpenAI Compatible RESTful API server." ) + # Uvicorn parser.add_argument("--host", type=str, default=None, help="host name") parser.add_argument("--port", type=int, default=8000, help="port number") parser.add_argument( @@ -17,29 +19,30 @@ def parse_args(): choices=["debug", "info", "warning", "error", "critical", "trace"], help="log level for uvicorn", ) - parser.add_argument( - "--response-role", type=str, default="assistant", help="The role name to return" - ) + # Triton parser.add_argument( "--tritonserver-log-level", type=int, default=0, - help="The tritonserver log level", + help="The tritonserver log verbosity level", ) parser.add_argument( "--model-repository", type=str, - default="/workspace/llm-models", + default="/opt/tritonserver/models", help="model repository", ) return parser.parse_args() if __name__ == "__main__": - # TODO: Cleanup args = parse_args() + # NOTE: Think about other ways to pass triton args to fastapi app, + # but use env vars for simplicity for now. + os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository + os.environ["TRITON_LOG_VERBOSE_LEVEL"] = args.tritonserver_log_level uvicorn.run( app, diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/example/src/routers/chat_completions.py index 4be4bcdf57..c0629ed92f 100644 --- a/qa/L0_openai/example/src/routers/chat_completions.py +++ b/qa/L0_openai/example/src/routers/chat_completions.py @@ -90,6 +90,8 @@ def create_chat_completion( for message in request.messages ] + # TODO: Use HF tokenizer or use Jinja/templater directly? 
+ # TODO: Function Calling / tools related to this? prompt = tokenizer.apply_chat_template( conversation=conversation, tokenize=False, diff --git a/qa/L0_openai/example/src/routers/completions.py b/qa/L0_openai/example/src/routers/completions.py index 2a4ae22a48..02e1bfb730 100644 --- a/qa/L0_openai/example/src/routers/completions.py +++ b/qa/L0_openai/example/src/routers/completions.py @@ -1,4 +1,7 @@ -from fastapi import APIRouter, Request +import time +import uuid + +from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse from src.schemas.openai import ( Choice, @@ -7,7 +10,7 @@ FinishReason, ObjectType, ) -from src.utils.triton import get_output +from src.utils.triton import create_vllm_inference_request, get_output router = APIRouter() @@ -45,17 +48,24 @@ def create_completion( Creates a completion for the provided prompt and parameters. """ - if not model or not tokenizer or not create_inference_request: - raise Exception("Unknown Model") + if not request.model: + raise Exception("No Model Provided") + + model = raw_request.app.server.model(request.model) + + # if not not tokenizer or not create_inference_request: + # raise Exception("Unknown Model") if request.suffix is not None: raise HTTPException(status_code=400, detail="suffix is not currently supported") - if request.model != model.name and request.model != model_source_name: + if request.model != model.name: raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") - if request.prompt is None: - request.prompt = "<|endoftext|>" + if not request.prompt: + # TODO: Needed? + # request.prompt = "<|endoftext|>" + raise HTTPException(status_code=400, detail="prompt must be non-empty") # Currently only support single string as input if not isinstance(request.prompt, str): @@ -71,7 +81,11 @@ def create_completion( request_id = f"cmpl-{uuid.uuid1()}" created = int(time.time()) - responses = model.infer(create_inference_request(model, request.prompt, request)) + # TODO: Determine backend, using hard-coded vllm for simplicity + # responses = model.infer(create_inference_request(model, request.prompt, request)) + responses = model.infer( + create_vllm_inference_request(model, request.prompt, request) + ) if request.stream: return StreamingResponse( streaming_completion_response(request_id, created, model.name, responses) diff --git a/qa/L0_openai/example/src/schemas/openai.py b/qa/L0_openai/example/src/schemas/openai.py index 5082bbea8d..c8c6a9b40f 100644 --- a/qa/L0_openai/example/src/schemas/openai.py +++ b/qa/L0_openai/example/src/schemas/openai.py @@ -7,7 +7,7 @@ from enum import Enum from typing import Any, Dict, List, Optional, Union -from pydantic import AnyUrl, BaseModel, Extra, Field, RootModel, confloat, conint +from pydantic import AnyUrl, BaseModel, ConfigDict, Field, RootModel, confloat, conint class Error(BaseModel): @@ -72,12 +72,12 @@ class CreateCompletionRequest(BaseModel): max_tokens: Optional[conint(ge=0)] = Field( 16, description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. 
[Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", - example=16, + examples=[16], ) n: Optional[conint(ge=1, le=128)] = Field( 1, description="How many completions to generate for each prompt.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n", - example=1, + examples=[1], ) presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( 0, @@ -98,22 +98,22 @@ class CreateCompletionRequest(BaseModel): suffix: Optional[str] = Field( None, description="The suffix that comes after a completion of inserted text.\n\nThis parameter is only supported for `gpt-3.5-turbo-instruct`.\n", - example="test.", + examples=["test."], ) temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( 1, description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", - example=1, + examples=[1], ) top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( 1, description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", - example=1, + examples=[1], ) user: Optional[str] = Field( None, description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", - example="user-1234", + examples=["user-1234"], ) @@ -251,10 +251,11 @@ class ChatCompletionRequestFunctionMessage(BaseModel): class FunctionParameters(BaseModel): - pass - - class Config: - extra = Extra.allow + model_config = ConfigDict(extra="allow") + # class Config: + # # TODO: Remove + # #extra = Extra.allow + # extra = "allow" class ChatCompletionFunctions(BaseModel): @@ -420,7 +421,7 @@ class ResponseFormat(BaseModel): type: Optional[Type6] = Field( "text", description="Must be one of `text` or `json_object`.", - example="json_object", + examples=["json_object"], ) @@ -773,12 +774,12 @@ class CreateChatCompletionRequest(BaseModel): messages: List[ChatCompletionRequestMessage] = Field( ..., description="A list of messages comprising the conversation so far. [Example Python code](https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models).", - min_items=1, + min_length=1, ) model: Union[str, Model2] = Field( ..., description="ID of the model to use. See the [model endpoint compatibility](/docs/models/model-endpoint-compatibility) table for details on which models work with the Chat API.", - example="gpt-4-turbo", + examples=["gpt-4-turbo"], ) frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( 0, @@ -803,7 +804,7 @@ class CreateChatCompletionRequest(BaseModel): n: Optional[conint(ge=1, le=128)] = Field( 1, description="How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. 
Keep `n` as `1` to minimize costs.", - example=1, + examples=[1], ) presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( 0, @@ -828,12 +829,12 @@ class CreateChatCompletionRequest(BaseModel): temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( 0.7, description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", - example=1, + examples=[1], ) top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( 1, description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", - example=1, + examples=[1], ) tools: Optional[List[ChatCompletionTool]] = Field( None, @@ -843,7 +844,7 @@ class CreateChatCompletionRequest(BaseModel): user: Optional[str] = Field( None, description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", - example="user-1234", + examples=["user-1234"], ) function_call: Optional[ Union[FunctionCall3, ChatCompletionFunctionCallOption] @@ -854,8 +855,8 @@ class CreateChatCompletionRequest(BaseModel): functions: Optional[List[ChatCompletionFunctions]] = Field( None, description="Deprecated in favor of `tools`.\n\nA list of functions the model may generate JSON inputs for.\n", - max_items=128, - min_items=1, + max_length=128, + min_length=1, ) diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py index e69de29bb2..cccaef7fd2 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -0,0 +1,47 @@ +import pytest +from fastapi.testclient import TestClient +from src.api_server import app + + +# Test for Chat Completions API +@pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + ], +) +def test_chat_completions_sampling_parameters(sampling_parameter, value): + # Arrange + messages = [{"role": "user", "content": "Hello"}] + expected_response = "Hi there" + + # Act + with TestClient(app) as client: + response = client.post( + "/chat/completions", + json={ + "model": "gpt-3.5-turbo", + "messages": messages, + sampling_parameter: value, + }, + ) + + # Assert + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] == expected_response + + +# Test for handling invalid chat input +def test_invalid_chat_input(): + # Act + with TestClient(app) as client: + response = client.post( + "/chat/completions", json={"model": "gpt-3.5-turbo", "messages": []} + ) + + # Assert + assert response.status_code == 400 diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py index e69de29bb2..e68bd42f1f 100644 --- a/qa/L0_openai/example/src/tests/test_completions.py +++ b/qa/L0_openai/example/src/tests/test_completions.py @@ -0,0 +1,52 @@ +import os +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from src.api_server import app + 
+TEST_MODEL = "gpt2" + + +# TODO: May need to modify fixture scope +@pytest.fixture(scope="module", autouse=True) +def setup_model_repository(): + model_repository = Path(__file__).parent / "test_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + + +def test_completions_sampling_parameters(): + prompt = "Hello" + + # Iterate through parameters within test to avoid constant server + # startup/shutdown when using TestClient. This can likely be refactored. + request_parameters = [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + ] + + with TestClient(app) as client: + for parameter, value in request_parameters: + response = client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt, parameter: value}, + ) + + print("Response:", response.json()) + assert response.status_code == 200 + # TODO: Flesh out or use dummy identity model + assert response.json()["choices"][0]["text"].strip() + + +# Test for handling invalid prompt +def test_empty_prompt(): + with TestClient(app) as client: + response = client.post( + "/v1/completions", json={"model": TEST_MODEL, "prompt": ""} + ) + + # Assert + assert response.status_code == 400 diff --git a/qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json b/qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json new file mode 100644 index 0000000000..96f398c471 --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json @@ -0,0 +1 @@ +{"model": "gpt2", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt b/qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt new file mode 100644 index 0000000000..a3edd238de --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt @@ -0,0 +1,2 @@ +backend: "vllm" +instance_group { kind: KIND_MODEL } diff --git a/qa/L0_openai/example/src/tests/test_utilities.py b/qa/L0_openai/example/src/tests/test_utilities.py index 1bf74dda01..9cbb9d2bbf 100644 --- a/qa/L0_openai/example/src/tests/test_utilities.py +++ b/qa/L0_openai/example/src/tests/test_utilities.py @@ -5,9 +5,6 @@ from fastapi.testclient import TestClient from src.api_server import app -client = TestClient(app) - - # TODO: Use fixture for less verbose model repo prep # @pytest.fixture(scope="session") # def setup_model_repository(): diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py index 9e749ba182..b593d32ff4 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/example/src/utils/triton.py @@ -1,4 +1,5 @@ import os +import time import numpy as np import tritonserver @@ -33,15 +34,15 @@ def load_model(server): def init_tritonserver(): - # TODO: How to pass arguments to server here? 
model_repository = os.environ.get( "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" ) + log_verbose_level = int(os.environ.get("TRITON_LOG_VERBOSE_LEVEL", "0")) print("Starting Triton Server Core...") server = tritonserver.Server( model_repository=model_repository, - log_verbose=1, + log_verbose=log_verbose_level, log_info=True, log_warn=True, log_error=True, @@ -49,14 +50,14 @@ def init_tritonserver(): ).start(wait_until_ready=True) # TODO: Cleanup - # print("Loading Model...\n\n") + print("Loading Model...\n\n") - # model, model_create_time, backend, tokenizer, model_source_name = load_model(server) + model, model_create_time, backend, tokenizer, model_source_name = load_model(server) - # if not (model and backend and tokenizer and model_create_time): - # raise Exception("Unknown Model") + if not (model and backend and tokenizer and model_create_time): + raise Exception("Unknown Model") - # print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") + print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") # if backend == "vllm": # create_inference_request = create_vllm_inference_request @@ -79,9 +80,12 @@ def create_vllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): inputs = {} - sampling_parameters = request.copy( - exclude={"model", "stream", "messages", "prompt", "echo"}, - ).model_dump(exclude_none=True) + excludes = {"model", "stream", "messages", "prompt", "echo"} + # FIXME: It seems that some subset of these keys will cause the model to not return a response + addl_excludes = {"user", "seed", "stop", "suffix", "logprobs", "logit_bias"} + sampling_parameters = request.model_dump( + exclude=excludes.union(addl_excludes), + ) inputs["text_input"] = [prompt] inputs["stream"] = [request.stream] exclude_input_in_output = True From 530c8710b23fb80b15db1e5d8c77f1572b7e01c6 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 5 Aug 2024 17:37:38 -0700 Subject: [PATCH 05/80] Add some plumbing for /v1/models routes, add mock_llm python model to speed up testing, next step add tests for /v1/models routes --- qa/L0_openai/example/main.py | 15 ++- qa/L0_openai/example/src/api_server.py | 30 ++++- qa/L0_openai/example/src/routers/models.py | 54 +++++---- .../src/tests/test_models/mock_llm/1/model.py | 108 ++++++++++++++++++ .../tests/test_models/mock_llm/config.pbtxt | 60 ++++++++++ .../gpt2/1/model.json | 0 .../gpt2/config.pbtxt | 0 qa/L0_openai/example/src/utils/triton.py | 74 +++++++++--- 8 files changed, 297 insertions(+), 44 deletions(-) create mode 100644 qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py create mode 100644 qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt rename qa/L0_openai/example/src/tests/{test_models => vllm_models}/gpt2/1/model.json (100%) rename qa/L0_openai/example/src/tests/{test_models => vllm_models}/gpt2/config.pbtxt (100%) diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/example/main.py index fb7c1e61db..12632319bd 100644 --- a/qa/L0_openai/example/main.py +++ b/qa/L0_openai/example/main.py @@ -10,9 +10,10 @@ def parse_args(): description="Triton OpenAI Compatible RESTful API server." 
) # Uvicorn - parser.add_argument("--host", type=str, default=None, help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") - parser.add_argument( + uvicorn_group = parser.add_argument_group("Uvicorn") + uvicorn_group.add_argument("--host", type=str, default=None, help="host name") + uvicorn_group.add_argument("--port", type=int, default=8000, help="port number") + uvicorn_group.add_argument( "--uvicorn-log-level", type=str, default="info", @@ -21,19 +22,21 @@ def parse_args(): ) # Triton - parser.add_argument( + triton_group = parser.add_argument_group("Triton Inference Server") + triton_group.add_argument( "--tritonserver-log-level", type=int, default=0, help="The tritonserver log verbosity level", ) - parser.add_argument( + triton_group.add_argument( "--model-repository", type=str, default="/opt/tritonserver/models", help="model repository", ) + return parser.parse_args() @@ -42,7 +45,7 @@ def parse_args(): # NOTE: Think about other ways to pass triton args to fastapi app, # but use env vars for simplicity for now. os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository - os.environ["TRITON_LOG_VERBOSE_LEVEL"] = args.tritonserver_log_level + os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) uvicorn.run( app, diff --git a/qa/L0_openai/example/src/api_server.py b/qa/L0_openai/example/src/api_server.py index 37f49ab5b7..10b740e978 100644 --- a/qa/L0_openai/example/src/api_server.py +++ b/qa/L0_openai/example/src/api_server.py @@ -5,16 +5,37 @@ import tritonserver from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware from src.routers import chat_completions, completions, models, utilities from src.utils.triton import init_tritonserver from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +def add_cors_middleware(app: FastAPI): + # Allow API calls through browser /docs route for debug purposes + origins = [ + "http://localhost", + ] + + print(f"[WARNING] Adding CORS for the following origins: {origins}") + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + @asynccontextmanager async def lifespan(app: FastAPI): print("Starting FastAPI app lifespan...") # Start the tritonserver on FastAPI app startup - app.server = init_tritonserver() + server, model_metadata = init_tritonserver() + app.server = server + # TODO: Clean up or refactor this flow to store models for /v1/models endpoints + app.models = {} + app.models[model_metadata.name] = model_metadata yield @@ -35,7 +56,8 @@ async def lifespan(app: FastAPI): "name": "MIT", "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", }, - servers=[{"url": "https://api.openai.com/v1"}], + # TODO: Do we need this? This affects the endpoints used in /docs endpoint. 
+ # servers=[{"url": "https://api.openai.com/v1"}], lifespan=lifespan, ) @@ -44,6 +66,10 @@ async def lifespan(app: FastAPI): app.include_router(completions.router) app.include_router(chat_completions.router) +# NOTE: For debugging purposes, should generally be restricted or removed +add_cors_middleware(app) + +# TODO: Refactor/remove globals where not necessary server: tritonserver.Server model: tritonserver.Model model_source_name: str diff --git a/qa/L0_openai/example/src/routers/models.py b/qa/L0_openai/example/src/routers/models.py index 297ded8d50..6446fda813 100644 --- a/qa/L0_openai/example/src/routers/models.py +++ b/qa/L0_openai/example/src/routers/models.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter +from fastapi import APIRouter, HTTPException, Request from src.schemas.openai import ListModelsResponse, Model, ObjectType router = APIRouter() @@ -8,47 +8,59 @@ @router.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) -def list_models() -> ListModelsResponse: +def list_models(request: Request) -> ListModelsResponse: """ Lists the currently available models, and provides basic information about each one such as the owner and availability. """ + models = request.app.models + if not models: + raise HTTPException(status_code=400, detail="No known models") - model_list = [ - Model( - id=model.name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - Model( - id=model_source_name, - created=model_create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - ] + model_list = [] + for model in models: + metadata = models[model] + if not metadata: + raise HTTPException( + status_code=400, detail=f"No metadata for model: {model}" + ) + + model_list.append( + Model( + id=metadata.name, + created=metadata.create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + ) return ListModelsResponse(object=ObjectType.list, data=model_list) @router.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) -def retrieve_model(model_name: str) -> Model: +def retrieve_model(request: Request, model_name: str) -> Model: """ Retrieves a model instance, providing basic information about the model such as the owner and permissioning. """ + models = request.app.models + if not models: + raise HTTPException(status_code=400, detail="No known models") + + model = models.get(model_name) + if not model: + raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") if model_name == model.name: return Model( id=model.name, - created=model_create_time, + created=model.create_time, object=ObjectType.model, owned_by=OWNED_BY, ) - if model_name == model_source_name: + if model_name == model.source_name: return Model( - id=model_source_name, - created=model_create_time, + id=model.source_name, + created=model.create_time, object=ObjectType.model, owned_by=OWNED_BY, ) diff --git a/qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py b/qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py new file mode 100644 index 0000000000..1cf5f3613c --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py @@ -0,0 +1,108 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + self.decoupled = self.model_config.get("model_transaction_policy", {}).get( + "decoupled" + ) + + def execute(self, requests): + if self.decoupled: + return self.exec_decoupled(requests) + else: + return self.exec(requests) + + def exec(self, requests): + responses = [] + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + + input_np = pb_utils.get_input_tensor_by_name( + request, "text_intpu" + ).as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() + stream = stream_np.flatten()[0] + if stream: + responses.append( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + "STREAM only supported in decoupled mode" + ) + ) + ) + else: + out_tensor = pb_utils.Tensor( + "text_output", np.repeat(input_np, rep_count, axis=1) + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses + + def exec_decoupled(self, requests): + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False + delay = params["DELAY"] if "DELAY" in params else None + + sender = request.get_response_sender() + input_np = pb_utils.get_input_tensor_by_name( + request, "text_input" + ).as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() + out_tensor = pb_utils.Tensor("text_output", input_np) + response = pb_utils.InferenceResponse([out_tensor]) + # If stream enabled, just send multiple copies of response + # FIXME: Could split up response string into tokens, but this is simpler for now. 
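+            # Decoupled models return results via the response sender: any
+            # number of sends may occur per request, and the last one must set
+            # the TRITONSERVER_RESPONSE_COMPLETE_FINAL flag (as done below).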
+ stream = stream_np.flatten()[0] + if stream: + for _ in range(rep_count): + if delay is not None: + time.sleep(delay) + sender.send(response) + sender.send( + None + if not fail_last + else pb_utils.InferenceResponse( + error=pb_utils.TritonError("An Error Occurred") + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + # If stream disabled, just send one response + else: + sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + return None diff --git a/qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt b/qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt new file mode 100644 index 0000000000..5f665ff543 --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
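+
+# Mock "python" backend model used to keep API tests fast: it simply echoes
+# "text_input" back as "text_output", optionally repeated or streamed
+# depending on request parameters (see 1/model.py).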
+backend: "python" + +max_batch_size: 0 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ 1, 1 ] + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1, 1 ] + } +] + +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ 1, -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json b/qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json similarity index 100% rename from qa/L0_openai/example/src/tests/test_models/gpt2/1/model.json rename to qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json diff --git a/qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt b/qa/L0_openai/example/src/tests/vllm_models/gpt2/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/test_models/gpt2/config.pbtxt rename to qa/L0_openai/example/src/tests/vllm_models/gpt2/config.pbtxt diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py index b593d32ff4..d04fea4d98 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/example/src/utils/triton.py @@ -1,36 +1,69 @@ import os import time +import typing +from dataclasses import dataclass import numpy as np import tritonserver from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest from src.utils.tokenizer import get_tokenizer -# TODO: Remove -SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm"} +# TODO: Refactor +# NOTE: Allow python backend for testing purposes +# TODO: How did this interact with BLS/TRTLLM models before this change? +SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm", "python"} +LLM_BACKENDS = {"vllm", "tensorrtllm"} # TODO KNOWN_MODELS = {"gpt2": "hf:gpt2"} -# TODO: Re-organize helpers +@dataclass +class TritonModelMetadata: + # Name used in Triton model repository + name: str + # Name of backend used by Triton + backend: str + + # TODO: Address typing + tokenizer: typing.Any + # Name in terms of a HuggingFace model or remote model registry name + source_name: str + # Time that model was loaded by Triton + create_time: int + + +# TODO: Refactor - this function seems to load a single model, +# but iterates through all models? 
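+# For a repository containing the known "gpt2" model on the vllm backend, the
+# returned metadata would look roughly like (illustrative values only):
+#   TritonModelMetadata(name="gpt2", backend="vllm", tokenizer=<HF tokenizer>,
+#                       source_name="gpt2", create_time=<load timestamp>)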
def load_model(server): model = None backends = [] tokenizer = None - model_source_name = None + source_name = None + model_name = None for model_name, version in server.models().keys(): if version != -1: continue current_model = server.load(model_name) backends.append(current_model.config()["backend"]) if model_name in KNOWN_MODELS.keys(): - model = current_model - model_source_name = KNOWN_MODELS[model_name].replace("hf:", "") + source_name = KNOWN_MODELS[model_name].replace("hf:", "") tokenizer = get_tokenizer(model_source_name) - if model and tokenizer: - for backend in backends: - if backend in SUPPORTED_BACKENDS: - return model, int(time.time()), backend, tokenizer, model_source_name - return None, None, None, None, None + + create_time = int(time.time()) + backend = None + for be in backends: + if be in SUPPORTED_BACKENDS: + backend = be + break + + # TODO + # return model, model_creation_time, backend, tokenizer, model_source_name + return TritonModelMetadata( + name=model_name, + backend=backend, + tokenizer=tokenizer, + source_name=source_name, + create_time=create_time, + ) def init_tritonserver(): @@ -52,19 +85,30 @@ def init_tritonserver(): # TODO: Cleanup print("Loading Model...\n\n") - model, model_create_time, backend, tokenizer, model_source_name = load_model(server) + # model, model_create_time, backend, tokenizer, _ = load_model(server) + metadata = load_model(server) - if not (model and backend and tokenizer and model_create_time): + if not metadata.name: raise Exception("Unknown Model") - print(f"\n\nModel: {model.name} Loaded with Backend: {backend}\n\n") + if not metadata.backend: + raise Exception("Unsupported Backend") + + # NOTE: Allow no tokenizer for mock python model for testing purposes + if not metadata.tokenizer and metadata.backend in LLM_BACKENDS: + raise Exception("Unsupported Tokenizer") + + if not metadata.create_time: + raise Exception("Unknown Model Creation Time") + + print(f"\n\nModel: {metadata.name} Loaded with Backend: {metadata.backend}\n\n") # if backend == "vllm": # create_inference_request = create_vllm_inference_request # elif backend == "tensorrtllm": # create_inference_request = create_trtllm_inference_request - return server + return server, metadata def get_output(response): From 9eba9c3cc5b751dcd211065ec30656fd53bb974a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 5 Aug 2024 18:06:37 -0700 Subject: [PATCH 06/80] Add simple tests for /v1/models and remove chat_completions test until ready --- .../src/tests/test_chat_completions.py | 47 ------------ .../example/src/tests/test_completions.py | 4 +- .../example/src/tests/test_utilities.py | 76 +++++++++++++------ qa/L0_openai/example/src/utils/triton.py | 3 +- 4 files changed, 57 insertions(+), 73 deletions(-) delete mode 100644 qa/L0_openai/example/src/tests/test_chat_completions.py diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py deleted file mode 100644 index cccaef7fd2..0000000000 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest -from fastapi.testclient import TestClient -from src.api_server import app - - -# Test for Chat Completions API -@pytest.mark.parametrize( - "sampling_parameter, value", - [ - ("temperature", 0.7), - ("max_tokens", 10), - ("top_p", 0.9), - ("frequency_penalty", 0.5), - ("presence_penalty", 0.2), - ], -) -def test_chat_completions_sampling_parameters(sampling_parameter, value): - # Arrange - messages = [{"role": 
"user", "content": "Hello"}] - expected_response = "Hi there" - - # Act - with TestClient(app) as client: - response = client.post( - "/chat/completions", - json={ - "model": "gpt-3.5-turbo", - "messages": messages, - sampling_parameter: value, - }, - ) - - # Assert - assert response.status_code == 200 - assert response.json()["choices"][0]["message"]["content"] == expected_response - - -# Test for handling invalid chat input -def test_invalid_chat_input(): - # Act - with TestClient(app) as client: - response = client.post( - "/chat/completions", json={"model": "gpt-3.5-turbo", "messages": []} - ) - - # Assert - assert response.status_code == 400 diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py index e68bd42f1f..7976334ba3 100644 --- a/qa/L0_openai/example/src/tests/test_completions.py +++ b/qa/L0_openai/example/src/tests/test_completions.py @@ -9,9 +9,9 @@ # TODO: May need to modify fixture scope -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(scope="function", autouse=True) def setup_model_repository(): - model_repository = Path(__file__).parent / "test_models" + model_repository = Path(__file__).parent / "vllm_models" os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) diff --git a/qa/L0_openai/example/src/tests/test_utilities.py b/qa/L0_openai/example/src/tests/test_utilities.py index 9cbb9d2bbf..87470f83a8 100644 --- a/qa/L0_openai/example/src/tests/test_utilities.py +++ b/qa/L0_openai/example/src/tests/test_utilities.py @@ -1,14 +1,18 @@ import os -import tempfile +from pathlib import Path import pytest from fastapi.testclient import TestClient from src.api_server import app -# TODO: Use fixture for less verbose model repo prep -# @pytest.fixture(scope="session") -# def setup_model_repository(): -# pass +TEST_MODEL = "mock_llm" + + +# TODO: May need to modify fixture scope +@pytest.fixture(scope="function", autouse=True) +def setup_model_repository(): + model_repository = Path(__file__).parent / "test_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) def test_not_found(): @@ -17,25 +21,13 @@ def test_not_found(): assert response.status_code == 404 -def test_startup_metrics(): - with tempfile.TemporaryDirectory() as model_repository: - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - with TestClient(app) as client: - response = client.get("/metrics") - assert response.status_code == 200 - # FIXME: Flesh out more - # NOTE: response.json() works even on non-json prometheus data? - assert "nv_cpu_utilization" in response.json() - # No models loaded, no per-model metrics - assert "nv_inference_count" not in response.json() +### Startup / Health ### def test_startup_success(): - with tempfile.TemporaryDirectory() as model_repository: - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - with TestClient(app) as client: - response = client.get("/health") - assert response.status_code == 200 + with TestClient(app) as client: + response = client.get("/health") + assert response.status_code == 200 def test_startup_fail(): @@ -43,5 +35,45 @@ def test_startup_fail(): with pytest.raises(Exception): # Test that FastAPI lifespan startup fails when initializing Triton # with unknown model repository. 
- with TestClient(app) as client: + with TestClient(app): pass + + +### Metrics ### + + +def test_startup_metrics(): + with TestClient(app) as client: + response = client.get("/metrics") + assert response.status_code == 200 + # FIXME: Flesh out more + # NOTE: response.json() works even on non-json prometheus data? + assert "nv_cpu_utilization" in response.json() + + +### Models ### + + +def test_models_list(): + # TODO: Load multiple models and make sure exactly ALL are returned + with TestClient(app) as client: + response = client.get("/v1/models") + assert response.status_code == 200 + # TODO: Flesh out + models = response.json()["data"] + assert len(models) == 1 + assert models[0]["id"] == TEST_MODEL + assert models[0]["object"] == "model" + assert models[0]["created"] > 0 + + +def test_models_get(): + # TODO: Load multiple models and make sure exactly 1 is returned + with TestClient(app) as client: + response = client.get(f"/v1/models/{TEST_MODEL}") + assert response.status_code == 200 + # TODO: Flesh out + model = response.json() + assert model["id"] == TEST_MODEL + assert model["object"] == "model" + assert model["created"] > 0 diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py index d04fea4d98..74f0d3a949 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/example/src/utils/triton.py @@ -46,7 +46,7 @@ def load_model(server): backends.append(current_model.config()["backend"]) if model_name in KNOWN_MODELS.keys(): source_name = KNOWN_MODELS[model_name].replace("hf:", "") - tokenizer = get_tokenizer(model_source_name) + tokenizer = get_tokenizer(source_name) create_time = int(time.time()) backend = None @@ -56,7 +56,6 @@ def load_model(server): break # TODO - # return model, model_creation_time, backend, tokenizer, model_source_name return TritonModelMetadata( name=model_name, backend=backend, From fb7ce72c42fce72ce2c90b47f52340de884d450e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 6 Aug 2024 23:41:17 -0700 Subject: [PATCH 07/80] Add some basic chat completions support and testing --- qa/L0_openai/example/Dockerfile | 1 - .../example/src/routers/chat_completions.py | 37 +++++++++++++++---- qa/L0_openai/example/src/routers/models.py | 1 + .../src/tests/test_chat_completions.py | 36 ++++++++++++++++++ qa/L0_openai/example/src/utils/triton.py | 24 +++++++++--- 5 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 qa/L0_openai/example/src/tests/test_chat_completions.py diff --git a/qa/L0_openai/example/Dockerfile b/qa/L0_openai/example/Dockerfile index 898f68bf95..e6dc560037 100644 --- a/qa/L0_openai/example/Dockerfile +++ b/qa/L0_openai/example/Dockerfile @@ -1,5 +1,4 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 FROM ${BASE_IMAGE} -# TODO: This should be installed in Triton container by default IMO RUN pip install /opt/tritonserver/python/*.whl RUN pip install "fastapi==0.111.1" "pytest==8.1.1" diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/example/src/routers/chat_completions.py index c0629ed92f..077820d163 100644 --- a/qa/L0_openai/example/src/routers/chat_completions.py +++ b/qa/L0_openai/example/src/routers/chat_completions.py @@ -1,4 +1,7 @@ -from fastapi import APIRouter +import time +import uuid + +from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse from src.schemas.openai import ( ChatCompletionChoice, @@ -11,14 +14,13 @@ CreateChatCompletionStreamResponse, ObjectType, ) -from 
src.utils.triton import get_output +from src.utils.triton import create_vllm_inference_request, get_output router = APIRouter() def streaming_chat_completion_response(request_id, created, model, role, responses): # first chunk - choice = ChatCompletionStreamingResponseChoice( index=0, delta=ChatCompletionStreamResponseDelta( @@ -68,18 +70,33 @@ def streaming_chat_completion_response(request_id, created, model, role, respons ) def create_chat_completion( request: CreateChatCompletionRequest, + raw_request: Request, ) -> CreateChatCompletionResponse | StreamingResponse: """ Creates a model response for the given chat conversation. """ - if not model or not tokenizer or not create_inference_request: - raise Exception("Unknown Model") + # TODO: Cleanup + model_metadatas = raw_request.app.models + if not model_metadatas: + raise HTTPException(status_code=400, detail="No known models") + + metadata = model_metadatas.get(request.model) + if not metadata: + raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") + + # TODO: python models? default tokenizer? no tokenization OK? + if not metadata.tokenizer: + raise HTTPException(status_code=400, detail="No known tokenizer") + + if not metadata.backend: + raise HTTPException(status_code=400, detail="No known backend") add_generation_prompt_default = True default_role = "assistant" - if request.model != model.name and request.model != model_source_name: + model = raw_request.app.server.model(request.model) + if request.model != model.name and request.model != metadata.source_name: raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") if request.n and request.n > 1: @@ -92,7 +109,7 @@ def create_chat_completion( # TODO: Use HF tokenizer or use Jinja/templater directly? # TODO: Function Calling / tools related to this? - prompt = tokenizer.apply_chat_template( + prompt = metadata.tokenizer.apply_chat_template( conversation=conversation, tokenize=False, add_generation_prompt=add_generation_prompt_default, @@ -101,7 +118,11 @@ def create_chat_completion( request_id = f"cmpl-{uuid.uuid1()}" created = int(time.time()) - responses = model.infer(create_inference_request(model, prompt, request)) + # TODO: Associate request function / backend with model metadata + # responses = model.infer(create_inference_request(model, prompt, request)) + print(f"[DEBUG] {model=}") + print(f"[DEBUG] {metadata=}") + responses = model.infer(create_vllm_inference_request(model, prompt, request)) if request.stream: return StreamingResponse( diff --git a/qa/L0_openai/example/src/routers/models.py b/qa/L0_openai/example/src/routers/models.py index 6446fda813..716ef0719d 100644 --- a/qa/L0_openai/example/src/routers/models.py +++ b/qa/L0_openai/example/src/routers/models.py @@ -49,6 +49,7 @@ def retrieve_model(request: Request, model_name: str) -> Model: if not model: raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") + # TODO: Do we want to accept both triton name or source name interchangeably? 
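+    # NOTE: `models` is keyed by the Triton model name, so the source-name
+    # branch below is only reachable when the source name matches the Triton
+    # name resolved above.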
if model_name == model.name: return Model( id=model.name, diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py new file mode 100644 index 0000000000..587e0ed0ae --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -0,0 +1,36 @@ +import os +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from src.api_server import app + +TEST_MODEL = "gpt2" + + +# TODO: May need to modify fixture scope +@pytest.fixture(scope="function", autouse=True) +def setup_model_repository(): + model_repository = Path(__file__).parent / "vllm_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + + +# Test for Chat Completions API +def test_successful_chat_completion(): + messages = [{"role": "user", "content": "Hello"}] + + # TODO: test various parameters + # TODO: test chat template - gpt2 raises error? + # TODO: test roles? + with TestClient(app) as client: + response = client.post( + "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + ) + + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] + # TODO: Double check expected role + assert response.json()["choices"][0]["message"]["role"] == "assistant" + + +# TODO: Test for handling invalid messages or payloads diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py index 74f0d3a949..498e0f331e 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/example/src/utils/triton.py @@ -22,6 +22,8 @@ class TritonModelMetadata: name: str # Name of backend used by Triton backend: str + # Triton model object handle + model: tritonserver.Model # TODO: Address typing tokenizer: typing.Any @@ -42,8 +44,8 @@ def load_model(server): for model_name, version in server.models().keys(): if version != -1: continue - current_model = server.load(model_name) - backends.append(current_model.config()["backend"]) + model = server.load(model_name) + backends.append(model.config()["backend"]) if model_name in KNOWN_MODELS.keys(): source_name = KNOWN_MODELS[model_name].replace("hf:", "") tokenizer = get_tokenizer(source_name) @@ -59,6 +61,7 @@ def load_model(server): return TritonModelMetadata( name=model_name, backend=backend, + model=model, tokenizer=tokenizer, source_name=source_name, create_time=create_time, @@ -87,7 +90,11 @@ def init_tritonserver(): # model, model_create_time, backend, tokenizer, _ = load_model(server) metadata = load_model(server) + # TODO: pydantic validation? if not metadata.name: + raise Exception("Unknown Model Name") + + if not metadata.model: raise Exception("Unknown Model") if not metadata.backend: @@ -124,11 +131,15 @@ def create_vllm_inference_request( ): inputs = {} excludes = {"model", "stream", "messages", "prompt", "echo"} - # FIXME: It seems that some subset of these keys will cause the model to not return a response - addl_excludes = {"user", "seed", "stop", "suffix", "logprobs", "logit_bias"} + + # NOTE: The exclude_none is important, as internals may not support + # values of NoneType at this time. 
sampling_parameters = request.model_dump( - exclude=excludes.union(addl_excludes), + exclude=excludes, + exclude_none=True, ) + print(f"[DEBUG] {sampling_parameters=}") + inputs["text_input"] = [prompt] inputs["stream"] = [request.stream] exclude_input_in_output = True @@ -136,9 +147,12 @@ def create_vllm_inference_request( if echo: exclude_input_in_output = not echo inputs["exclude_input_in_output"] = [exclude_input_in_output] + print(f"[DEBUG] {inputs=}") + return model.create_request(inputs=inputs, parameters=sampling_parameters) +# TODO: test def create_trtllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): From 0cf8fae8eb027366973d38f4a4fe376bea881715 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 6 Aug 2024 23:42:00 -0700 Subject: [PATCH 08/80] WIP: Add OpenAI client test that works when server is already running, need to refactor for including server startup --- .../example/src/tests/test_openai_client.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 qa/L0_openai/example/src/tests/test_openai_client.py diff --git a/qa/L0_openai/example/src/tests/test_openai_client.py b/qa/L0_openai/example/src/tests/test_openai_client.py new file mode 100644 index 0000000000..f4414ce746 --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_openai_client.py @@ -0,0 +1,68 @@ +from openai import OpenAI + + +# TODO: assumes already running server, so either refactor tests to work +# this way, or add TestClient to start server +def test_openai_client_completion(): + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + # openai_api_base = "http://localhost:8000/v1" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # TODO + models = client.models.list() + print(f"Models: {models}") + model = models.data[0].id + print(f"Model: {model}") + + completion = client.completions.create( + prompt="Hi there", + model=model, + ) + + assert completion + print(f"Completion results: {completion}") + + +# TODO: assumes already running server, so either refactor tests to work +# this way, or add TestClient to start server +def test_openai_client_chat_completion(): + # Modify OpenAI's API key and API base to use vLLM's API server. 
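+    # (Here the base URL points at this Triton OpenAI-compatible frontend,
+    # which only requires a placeholder API key.)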
+ openai_api_key = "EMPTY" + # openai_api_base = "http://localhost:8000/v1" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # TODO + models = client.models.list() + print(f"Models: {models}") + model = models.data[0].id + print(f"Model: {model}") + + chat_completion = client.chat.completions.create( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + model=model, + ) + + assert chat_completion + assert chat_completion.choices + assert chat_completion.choices[0] + assert chat_completion.choices[0].finish_reason == "stop" + print(f"Chat completion results: {chat_completion}") From 3d227dd091948b17998472861f2053972a8edbbd Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 8 Aug 2024 13:03:04 -0700 Subject: [PATCH 09/80] Flesh out /completions tests more, refactor to class fixture for running server only once per test class, skip openai client tests --- .../src/tests/test_chat_completions.py | 82 +++++++--- .../example/src/tests/test_completions.py | 153 ++++++++++++++---- .../example/src/tests/test_openai_client.py | 122 +++++++------- 3 files changed, 253 insertions(+), 104 deletions(-) diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py index 587e0ed0ae..c7d41fa044 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -8,29 +8,75 @@ TEST_MODEL = "gpt2" -# TODO: May need to modify fixture scope -@pytest.fixture(scope="function", autouse=True) -def setup_model_repository(): - model_repository = Path(__file__).parent / "vllm_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) +# TODO: Test TRTLLM too +class TestChatCompletion: + # TODO: Consider module/package scope, or join Completions tests into same file + # to run server only once for both sets of tests for faster iteration. + @pytest.fixture(scope="class", autouse=True) + def client(self): + model_repository = Path(__file__).parent / "vllm_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + with TestClient(app) as test_client: + yield test_client + def test_chat_completion_defaults(self, client): + messages = [{"role": "user", "content": "Hello"}] -# Test for Chat Completions API -def test_successful_chat_completion(): - messages = [{"role": "user", "content": "Hello"}] - - # TODO: test various parameters - # TODO: test chat template - gpt2 raises error? - # TODO: test roles? - with TestClient(app) as client: response = client.post( "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} ) - assert response.status_code == 200 - assert response.json()["choices"][0]["message"]["content"] - # TODO: Double check expected role - assert response.json()["choices"][0]["message"]["role"] == "assistant" + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] + # TODO: Need to test different roles? 
+ assert response.json()["choices"][0]["message"]["role"] == "assistant" + + def test_chat_completion_parameters(self, client): + messages = [{"role": "user", "content": "Hello"}] + + # Iterate through parameters within test to avoid constant server + # startup/shutdown when using TestClient. This can likely be refactored. + request_parameters = [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + ] + for parameter, value in request_parameters: + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": messages, parameter: value}, + ) -# TODO: Test for handling invalid messages or payloads + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] + assert response.json()["choices"][0]["message"]["role"] == "assistant" + + def test_chat_completion_no_messages(self, client): + # Message validation requires min_length of 1 + messages = [] + response = client.post( + "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + ) + assert response.status_code == 422 + assert ( + response.json()["detail"][0]["msg"] + == "List should have at least 1 item after validation, not 0" + ) + + def test_chat_completion_empty_message(self, client): + # Message validation requires min_length of 1 + messages = [{}] + response = client.post( + "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + ) + assert response.status_code == 422 + assert response.json()["detail"][0]["msg"] == "Field required" + + # TODO: Test for handling invalid messages or payloads + # TODO: test chat/instruct model? gpt2 logs error about lack of chat template + # TODO: test roles? + # TODO: function calling? + # TODO: lora / multi-lora? diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py index 7976334ba3..bef45b3050 100644 --- a/qa/L0_openai/example/src/tests/test_completions.py +++ b/qa/L0_openai/example/src/tests/test_completions.py @@ -8,45 +8,136 @@ TEST_MODEL = "gpt2" -# TODO: May need to modify fixture scope -@pytest.fixture(scope="function", autouse=True) -def setup_model_repository(): - model_repository = Path(__file__).parent / "vllm_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - - -def test_completions_sampling_parameters(): - prompt = "Hello" - - # Iterate through parameters within test to avoid constant server - # startup/shutdown when using TestClient. This can likely be refactored. - request_parameters = [ - ("temperature", 0.7), - ("max_tokens", 10), - ("top_p", 0.9), - ("frequency_penalty", 0.5), - ("presence_penalty", 0.2), - ] - - with TestClient(app) as client: - for parameter, value in request_parameters: - response = client.post( +# TODO: Test TRTLLM too +class TestChatCompletion: + # TODO: Consider module/package scope, or join ChatCompletions tests into same file + # to run server only once for both sets of tests for faster iteration. 
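+    # The class-scoped client fixture below brings the app (and the underlying
+    # Triton server) up once via the FastAPI lifespan and reuses it for every
+    # test in this class.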
+ @pytest.fixture(scope="class") + def client(self): + model_repository = Path(__file__).parent / "vllm_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + with TestClient(app) as test_client: + yield test_client + + def test_completions_defaults(self, client): + prompt = "Hello" + + response = client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt}, + ) + + print("Response:", response.json()) + assert response.status_code == 200 + # NOTE: Could be improved to look for certain quality of response, + # or tested with dummy identity model. + assert response.json()["choices"][0]["text"].strip() + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + ("logprobs", 5), + ("logit_bias", {"0": 0}), + ], + ) + def test_completions_sampling_parameters(self, client, sampling_parameter, value): + prompt = "Hello" + + response = client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt, sampling_parameter: value}, + ) + print("Response:", response.json()) + + # TODO: Add support and remove this check + unsupported_parameters = ["logprobs", "logit_bias"] + if sampling_parameter in unsupported_parameters: + assert response.status_code == 400 + assert response.json()["detail"] == "logit bias and log probs not supported" + return + + assert response.status_code == 200 + assert response.json()["choices"][0]["text"].strip() + + # Simple tests to verify max_tokens roughly behaves as expected + def test_completions_max_tokens(self, client): + prompt = "Hello" + responses = [] + + max_tokens = 1 + responses.append( + client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, parameter: value}, + json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, ) + ) + max_tokens = 1 + responses.append( + client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, + ) + ) + max_tokens = 100 + responses.append( + client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, + ) + ) + for response in responses: print("Response:", response.json()) assert response.status_code == 200 - # TODO: Flesh out or use dummy identity model - assert response.json()["choices"][0]["text"].strip() + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + assert len(response1_text) == len(response2_text) == 1 + assert len(response3_text) > len(response1_text) + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 2.1), + ("temperature", -0.1), + ("max_tokens", -1), + ("top_p", 1.1), + ("frequency_penalty", 3), + ("frequency_penalty", -3), + ("presence_penalty", 2.1), + ("presence_penalty", -2.1), + ], + ) + def test_completions_invalid_sampling_parameters( + self, client, sampling_parameter, value + ): + prompt = "Hello" -# Test for handling invalid prompt -def test_empty_prompt(): - with TestClient(app) as client: + response = client.post( + "/v1/completions", + json={"model": TEST_MODEL, "prompt": prompt, sampling_parameter: value}, + ) + + print("Response:", response.json()) + assert response.status_code == 422 + + def test_empty_prompt(self, client): response = client.post( "/v1/completions", json={"model": 
TEST_MODEL, "prompt": ""} ) - # Assert - assert response.status_code == 400 + # NOTE: Should this be validated in schema instead? + # 400 Error returned in route handler + assert response.status_code == 400 + + def test_no_prompt(self, client): + response = client.post("/v1/completions", json={"model": TEST_MODEL}) + + # 422 Error returned by schema validation + assert response.status_code == 422 diff --git a/qa/L0_openai/example/src/tests/test_openai_client.py b/qa/L0_openai/example/src/tests/test_openai_client.py index f4414ce746..67d345a643 100644 --- a/qa/L0_openai/example/src/tests/test_openai_client.py +++ b/qa/L0_openai/example/src/tests/test_openai_client.py @@ -1,68 +1,80 @@ +import os +from pathlib import Path + +import pytest from openai import OpenAI -# TODO: assumes already running server, so either refactor tests to work -# this way, or add TestClient to start server -def test_openai_client_completion(): - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - # openai_api_base = "http://localhost:8000/v1" - openai_api_base = "http://localhost:8000/v1" +class TestOpenAIClient: + # Start server, then with scope="class", pass execution back to each test + # until all tests in the class have been run, then clean up. + # TODO: OpenAI client requires server is already running + @pytest.fixture(scope="class", autouse=True) + def start_server(self): + model_repository = Path(__file__).parent / "vllm_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + + # TODO: Start server in background + # ex: https://github.com/vllm-project/vllm/blob/main/tests/utils.py + # proc = subprocess.run(...) + yield + # proc.terminate() + # proc.wait() + # proc.kill() - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) + @pytest.mark.skip(reason="Not Implemented Yet") + def test_openai_client_completion(self): + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" - # TODO - models = client.models.list() - print(f"Models: {models}") - model = models.data[0].id - print(f"Model: {model}") + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) - completion = client.completions.create( - prompt="Hi there", - model=model, - ) + models = client.models.list() + print(f"Models: {models}") + model = models.data[0].id + print(f"Model: {model}") - assert completion - print(f"Completion results: {completion}") + completion = client.completions.create( + prompt="Hi there", + model=model, + ) + assert completion + print(f"Completion results: {completion}") -# TODO: assumes already running server, so either refactor tests to work -# this way, or add TestClient to start server -def test_openai_client_chat_completion(): - # Modify OpenAI's API key and API base to use vLLM's API server. 
- openai_api_key = "EMPTY" - # openai_api_base = "http://localhost:8000/v1" - openai_api_base = "http://localhost:8000/v1" + @pytest.mark.skip(reason="Not Implemented Yet") + def test_openai_client_chat_completion(self): + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) - # TODO - models = client.models.list() - print(f"Models: {models}") - model = models.data[0].id - print(f"Model: {model}") + models = client.models.list() + print(f"Models: {models}") + model = models.data[0].id + print(f"Model: {model}") - chat_completion = client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Who won the world series in 2020?"}, - { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020.", - }, - {"role": "user", "content": "Where was it played?"}, - ], - model=model, - ) + chat_completion = client.chat.completions.create( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + model=model, + ) - assert chat_completion - assert chat_completion.choices - assert chat_completion.choices[0] - assert chat_completion.choices[0].finish_reason == "stop" - print(f"Chat completion results: {chat_completion}") + assert chat_completion + assert chat_completion.choices + assert chat_completion.choices[0] + assert chat_completion.choices[0].finish_reason == "stop" + print(f"Chat completion results: {chat_completion}") From 4c1ac553a672ce9db6f6144ef675ac320deaa109 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 8 Aug 2024 16:07:10 -0700 Subject: [PATCH 10/80] Update chat completions schema to enforce max_tokens >= 0, and lower the default value --- qa/L0_openai/example/src/schemas/openai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/L0_openai/example/src/schemas/openai.py b/qa/L0_openai/example/src/schemas/openai.py index c8c6a9b40f..488dfda3bb 100644 --- a/qa/L0_openai/example/src/schemas/openai.py +++ b/qa/L0_openai/example/src/schemas/openai.py @@ -797,8 +797,8 @@ class CreateChatCompletionRequest(BaseModel): None, description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.", ) - max_tokens: Optional[int] = Field( - 8168, + max_tokens: Optional[conint(ge=0)] = Field( + 16, description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. 
[Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", ) n: Optional[conint(ge=1, le=128)] = Field( From 5b15877e4eb4ef8cc939dc27b5142b76834ed75e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 8 Aug 2024 17:00:08 -0700 Subject: [PATCH 11/80] Add more tests around max_tokens and temperature behavior, as well as some placeholder tests for future feature support --- qa/L0_openai/example/main.py | 5 +- .../example/src/routers/chat_completions.py | 5 + .../src/tests/test_chat_completions.py | 224 ++++++++++++++++-- .../example/src/tests/test_completions.py | 56 +++-- .../example/src/tests/test_openai_client.py | 4 + 5 files changed, 254 insertions(+), 40 deletions(-) diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/example/main.py index 12632319bd..e206d6f724 100644 --- a/qa/L0_openai/example/main.py +++ b/qa/L0_openai/example/main.py @@ -33,7 +33,7 @@ def parse_args(): triton_group.add_argument( "--model-repository", type=str, - default="/opt/tritonserver/models", + default=None, help="model repository", ) @@ -44,7 +44,8 @@ def parse_args(): args = parse_args() # NOTE: Think about other ways to pass triton args to fastapi app, # but use env vars for simplicity for now. - os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository + if args.model_repository: + os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) uvicorn.run( diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/example/src/routers/chat_completions.py index 077820d163..46d0b2524e 100644 --- a/qa/L0_openai/example/src/routers/chat_completions.py +++ b/qa/L0_openai/example/src/routers/chat_completions.py @@ -102,6 +102,11 @@ def create_chat_completion( if request.n and request.n > 1: raise HTTPException(status_code=400, detail=f"Only single choice is supported") + if request.logit_bias is not None or request.logprobs: + raise HTTPException( + status_code=400, detail="logit bias and log probs not supported" + ) + conversation = [ {"role": str(message.role), "content": str(message.content)} for message in request.messages diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py index c7d41fa044..e0a31af3e6 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -6,55 +6,228 @@ from src.api_server import app TEST_MODEL = "gpt2" +TEST_PROMPT = "What is the capital of France?" # TODO: Test TRTLLM too -class TestChatCompletion: +class TestChatCompletions: # TODO: Consider module/package scope, or join Completions tests into same file # to run server only once for both sets of tests for faster iteration. 
@pytest.fixture(scope="class", autouse=True) def client(self): + # TODO: Test TRT-LLM models as well model_repository = Path(__file__).parent / "vllm_models" os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) with TestClient(app) as test_client: yield test_client - def test_chat_completion_defaults(self, client): - messages = [{"role": "user", "content": "Hello"}] + def test_chat_completions_defaults(self, client): + messages = [{"role": "user", "content": TEST_PROMPT}] response = client.post( "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} ) assert response.status_code == 200 - assert response.json()["choices"][0]["message"]["content"] - # TODO: Need to test different roles? - assert response.json()["choices"][0]["message"]["role"] == "assistant" + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + def test_chat_completions_system_prompt(self, client): + # NOTE: Currently just sanity check that there are no issues when a + # system role is provided. There is no test logic to measure the quality + # of the response yet. + messages = [ + {"role": "system", "content": "You are a Triton Inference Server expert."}, + {"role": "user", "content": TEST_PROMPT}, + ] + + response = client.post( + "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + ) - def test_chat_completion_parameters(self, client): - messages = [{"role": "user", "content": "Hello"}] + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + def test_chat_completions_system_prompt_only(self, client): + # No user prompt provided + messages = [ + {"role": "system", "content": "You are a Triton Inference Server expert."} + ] + + response = client.post( + "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + ) - # Iterate through parameters within test to avoid constant server - # startup/shutdown when using TestClient. This can likely be refactored. 
- request_parameters = [ + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ ("temperature", 0.7), ("max_tokens", 10), ("top_p", 0.9), ("frequency_penalty", 0.5), ("presence_penalty", 0.2), - ] + # logprobs is a boolean for chat completions + ("logprobs", True), + ("logit_bias", {"0": 0}), + ], + ) + def test_chat_completions_sampling_parameters( + self, client, sampling_parameter, value + ): + messages = [{"role": "user", "content": TEST_PROMPT}] + + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": messages, sampling_parameter: value}, + ) + + # TODO: Add support and remove this check + unsupported_parameters = ["logprobs", "logit_bias"] + if sampling_parameter in unsupported_parameters: + assert response.status_code == 400 + assert response.json()["detail"] == "logit bias and log probs not supported" + return - for parameter, value in request_parameters: - response = client.post( + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] + assert response.json()["choices"][0]["message"]["role"] == "assistant" + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 2.1), + ("temperature", -0.1), + ("max_tokens", -1), + ("top_p", 1.1), + ("frequency_penalty", 3), + ("frequency_penalty", -3), + ("presence_penalty", 2.1), + ("presence_penalty", -2.1), + ], + ) + def test_chat_completions_invalid_sampling_parameters( + self, client, sampling_parameter, value + ): + messages = [{"role": "user", "content": TEST_PROMPT}] + + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": messages, sampling_parameter: value}, + ) + + print("Response:", response.json()) + assert response.status_code == 422 + + # Simple tests to verify max_tokens roughly behaves as expected + def test_chat_completions_max_tokens(self, client): + responses = [] + messages = [{"role": "user", "content": TEST_PROMPT}] + payload = {"model": TEST_MODEL, "messages": messages, "max_tokens": 1} + + # Send two requests with max_tokens = 1 to check their similarity + payload["max_tokens"] = 1 + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + # Send one requests with larger max_tokens to check its dis-similarity + payload["max_tokens"] = 100 + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + # Simplification: One token shouldn't be more than one space-delimited word + assert len(response1_text) == len(response2_text) == 1 + assert len(response3_text) > len(response1_text) + + @pytest.mark.parametrize( + "temperature", + [0.0, 1.0], + ) + # Simple tests to verify temperature roughly behaves as expected + def test_chat_completions_temperature(self, client, temperature): + responses = [] + messages = [{"role": "user", "content": TEST_PROMPT}] + payload = { + "model": 
TEST_MODEL, + "messages": messages, + "temperature": temperature, + } + + responses.append( + client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": messages, parameter: value}, + json=payload, ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + for response in responses: + print("Response:", response.json()) assert response.status_code == 200 - assert response.json()["choices"][0]["message"]["content"] - assert response.json()["choices"][0]["message"]["role"] == "assistant" - def test_chat_completion_no_messages(self, client): + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + + # Temperature of 0.0 indicates greedy sampling, so check + # that two equivalent requests produce the same response. + if temperature == 0.0: + # NOTE: This check may be ambitious to get an exact match in all + # frameworks depending on how other parameter defaults are set, so + # it can probably be removed if it introduces flakiness. + print(f"Comparing '{response1_text}' == '{response2_text}'") + assert response1_text == response2_text + # Temperature of 1.0 indicates maximum randomness, so check + # that two equivalent requests produce different responses. + elif temperature == 1.0: + print(f"Comparing '{response1_text}' != '{response2_text}'") + assert response1_text != response2_text + # Don't bother checking values other than the extremes + else: + raise ValueError(f"Unexpected {temperature=} for this test.") + + def test_chat_completions_no_message(self, client): # Message validation requires min_length of 1 messages = [] response = client.post( @@ -66,7 +239,7 @@ def test_chat_completion_no_messages(self, client): == "List should have at least 1 item after validation, not 0" ) - def test_chat_completion_empty_message(self, client): + def test_chat_completions_empty_message(self, client): # Message validation requires min_length of 1 messages = [{}] response = client.post( @@ -75,8 +248,21 @@ def test_chat_completion_empty_message(self, client): assert response.status_code == 422 assert response.json()["detail"][0]["msg"] == "Field required" + @pytest.mark.skip(reason="Not Implemented Yet") + def test_function_calling(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_lora(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_multi_lora(self): + pass + # TODO: Test for handling invalid messages or payloads # TODO: test chat/instruct model? gpt2 logs error about lack of chat template # TODO: test roles? # TODO: function calling? # TODO: lora / multi-lora? + # TODO: genai-perf test? diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py index bef45b3050..84a21e13d3 100644 --- a/qa/L0_openai/example/src/tests/test_completions.py +++ b/qa/L0_openai/example/src/tests/test_completions.py @@ -6,25 +6,24 @@ from src.api_server import app TEST_MODEL = "gpt2" +TEST_PROMPT = "The capital of France is" -# TODO: Test TRTLLM too -class TestChatCompletion: +class TestCompletions: # TODO: Consider module/package scope, or join ChatCompletions tests into same file # to run server only once for both sets of tests for faster iteration. 
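    # NOTE (editorial, illustrative only): the requests issued by these tests are plain
    # OpenAI-style completions calls; an equivalent standalone request against a running
    # server could look like the following (URL and port are assumptions):
    #
    #     import requests
    #
    #     r = requests.post(
    #         "http://localhost:8000/v1/completions",
    #         json={"model": "gpt2", "prompt": "The capital of France is", "max_tokens": 16},
    #     )
    #     print(r.json()["choices"][0]["text"])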
@pytest.fixture(scope="class") def client(self): + # TODO: Test TRT-LLM models as well model_repository = Path(__file__).parent / "vllm_models" os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) with TestClient(app) as test_client: yield test_client def test_completions_defaults(self, client): - prompt = "Hello" - response = client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt}, + json={"model": TEST_MODEL, "prompt": TEST_PROMPT}, ) print("Response:", response.json()) @@ -41,16 +40,19 @@ def test_completions_defaults(self, client): ("top_p", 0.9), ("frequency_penalty", 0.5), ("presence_penalty", 0.2), + # logprobs is an integer for completions ("logprobs", 5), ("logit_bias", {"0": 0}), ], ) def test_completions_sampling_parameters(self, client, sampling_parameter, value): - prompt = "Hello" - response = client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, sampling_parameter: value}, + json={ + "model": TEST_MODEL, + "prompt": TEST_PROMPT, + sampling_parameter: value, + }, ) print("Response:", response.json()) @@ -66,28 +68,29 @@ def test_completions_sampling_parameters(self, client, sampling_parameter, value # Simple tests to verify max_tokens roughly behaves as expected def test_completions_max_tokens(self, client): - prompt = "Hello" responses = [] + payload = {"model": TEST_MODEL, "prompt": TEST_PROMPT, "max_tokens": 1} - max_tokens = 1 + # Send two requests with max_tokens = 1 to check their similarity + payload["max_tokens"] = 1 responses.append( client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, + json=payload, ) ) - max_tokens = 1 responses.append( client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, + json=payload, ) ) - max_tokens = 100 + # Send one requests with larger max_tokens to check its dis-similarity + payload["max_tokens"] = 100 responses.append( client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, "max_tokens": max_tokens}, + json=payload, ) ) @@ -98,6 +101,7 @@ def test_completions_max_tokens(self, client): response1_text = responses[0].json()["choices"][0]["text"].strip().split() response2_text = responses[1].json()["choices"][0]["text"].strip().split() response3_text = responses[2].json()["choices"][0]["text"].strip().split() + # Simplification: One token shouldn't be more than one space-delimited word assert len(response1_text) == len(response2_text) == 1 assert len(response3_text) > len(response1_text) @@ -117,17 +121,23 @@ def test_completions_max_tokens(self, client): def test_completions_invalid_sampling_parameters( self, client, sampling_parameter, value ): - prompt = "Hello" - response = client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": prompt, sampling_parameter: value}, + json={ + "model": TEST_MODEL, + "prompt": TEST_PROMPT, + sampling_parameter: value, + }, ) print("Response:", response.json()) assert response.status_code == 422 - def test_empty_prompt(self, client): + def test_completions_no_prompt(self, client): + response = client.post("/v1/completions", json={"model": TEST_MODEL}) + assert response.status_code == 422 + + def test_completions_empty_prompt(self, client): response = client.post( "/v1/completions", json={"model": TEST_MODEL, "prompt": ""} ) @@ -141,3 +151,11 @@ def test_no_prompt(self, client): # 422 Error returned by schema validation assert response.status_code == 422 + + @pytest.mark.skip(reason="Not Implemented Yet") + def 
test_lora(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_multi_lora(self): + pass diff --git a/qa/L0_openai/example/src/tests/test_openai_client.py b/qa/L0_openai/example/src/tests/test_openai_client.py index 67d345a643..c83d69a9b1 100644 --- a/qa/L0_openai/example/src/tests/test_openai_client.py +++ b/qa/L0_openai/example/src/tests/test_openai_client.py @@ -78,3 +78,7 @@ def test_openai_client_chat_completion(self): assert chat_completion.choices[0] assert chat_completion.choices[0].finish_reason == "stop" print(f"Chat completion results: {chat_completion}") + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_openai_client_function_calling(self): + pass From f9f4b077a5e09121602c257724f0f98a7132c62c Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 9 Aug 2024 14:26:59 -0700 Subject: [PATCH 12/80] Remove unused parts from tokenizer.py --- qa/L0_openai/example/src/utils/tokenizer.py | 75 +-------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/qa/L0_openai/example/src/utils/tokenizer.py b/qa/L0_openai/example/src/utils/tokenizer.py index 0011172c19..a60783a5f9 100644 --- a/qa/L0_openai/example/src/utils/tokenizer.py +++ b/qa/L0_openai/example/src/utils/tokenizer.py @@ -1,18 +1,7 @@ -import os from typing import Optional, Union from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -# from vllm.config import VLLM_USE_MODELSCOPE -# from vllm.logger import init_logger -# from vllm.lora.request import LoRARequest -# from vllm.transformers_utils.tokenizers import BaichuanTokenizer -# from vllm.utils import make_async - -# logger = init_logger(__name__) - -VLLM_USE_MODELSCOPE = False - def get_cached_tokenizer( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] @@ -62,23 +51,6 @@ def get_tokenizer( **kwargs, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: """Gets a tokenizer for the given model name via Huggingface/modelscope.""" - if VLLM_USE_MODELSCOPE: - # download model from ModelScope hub, - # lazy import so that modelscope is not required for normal use. - # pylint: disable=C. - from modelscope.hub.snapshot_download import snapshot_download - - # Only set the tokenizer here, model will be downloaded on the workers. - if not os.path.exists(tokenizer_name): - tokenizer_path = snapshot_download( - model_id=tokenizer_name, - cache_dir=download_dir, - revision=tokenizer_revision, - # Ignore weights - we only need the tokenizer. - ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"], - ) - tokenizer_name = tokenizer_path - if tokenizer_mode == "slow": if kwargs.get("use_fast", False): raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") @@ -93,32 +65,8 @@ def get_tokenizer( **kwargs, ) except ValueError as e: - # If the error pertains to the tokenizer class not existing or not - # currently being imported, suggest using the --trust-remote-code flag. - if not trust_remote_code and ( - "does not exist or is not currently imported." in str(e) - or "requires you to execute the tokenizer file" in str(e) - ): - err_msg = ( - "Failed to load the tokenizer. If the tokenizer is a custom " - "tokenizer not yet available in the HuggingFace transformers " - "library, consider setting `trust_remote_code=True` in LLM " - "or using the `--trust-remote-code` flag in the CLI." 
- ) - raise RuntimeError(err_msg) from e - else: - raise e + raise e except AttributeError as e: - # if "BaichuanTokenizer" in str(e): - # # This is for the error "'BaichuanTokenizer' object has no - # # attribute 'sp_model'". - # tokenizer = BaichuanTokenizer.from_pretrained( - # tokenizer_name, - # *args, - # trust_remote_code=trust_remote_code, - # tokenizer_revision=tokenizer_revision, - # **kwargs) - # else: raise e if not isinstance(tokenizer, PreTrainedTokenizerFast): @@ -127,24 +75,3 @@ def get_tokenizer( "slowdown. Consider using a fast tokenizer instead." ) return get_cached_tokenizer(tokenizer) - - -# def get_lora_tokenizer(lora_request: LoRARequest, *args, -# **kwargs) -> Optional[PreTrainedTokenizer]: -# if lora_request is None: -# return None -# try: -# tokenizer = get_tokenizer(lora_request.lora_local_path, *args, -# **kwargs) -# except OSError as e: -# # No tokenizer was found in the LoRA folder, -# # use base model tokenizer -# logger.warning( -# f"No tokenizer found in {lora_request.lora_local_path}, " -# "using base model tokenizer instead. " -# f"(Exception: {str(e)})") -# tokenizer = None -# return tokenizer - - -# get_lora_tokenizer_async = make_async(get_lora_tokenizer) From 773aee0c1d8f3e5b9fdb6ce675a9a8ff55cd7646 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 13 Aug 2024 17:43:46 -0700 Subject: [PATCH 13/80] All existing tests passing for both TRT-LLM and vLLM, updated model load logic to track/list all models for now, marked xfail test for known issue with TRT-LLM temperature, added logic to support testing both TRT-LLM and vLLM based on environment, added openai dep to Dockerfile but skipping openai tests for now --- qa/L0_openai/example/Dockerfile | 2 +- qa/L0_openai/example/main.py | 3 +- qa/L0_openai/example/src/api_server.py | 67 +- .../example/src/routers/chat_completions.py | 34 +- .../example/src/routers/completions.py | 37 +- qa/L0_openai/example/src/routers/models.py | 23 +- .../{utilities.py => observability.py} | 0 .../tests/tensorrtllm_models/ensemble/1/.tmp | 0 .../tensorrtllm_models/ensemble/config.pbtxt | 470 +++++++++++ .../postprocessing/1/model.py | 246 ++++++ .../postprocessing/config.pbtxt | 113 +++ .../preprocessing/1/model.py | 418 +++++++++ .../preprocessing/config.pbtxt | 156 ++++ .../tensorrt_llm/1/.gitkeep | 0 .../tensorrt_llm/1/model.py | 797 ++++++++++++++++++ .../tensorrt_llm/config.pbtxt | 542 ++++++++++++ .../tensorrt_llm_bls/1/lib/decode.py | 347 ++++++++ .../tensorrt_llm_bls/1/lib/triton_decoder.py | 478 +++++++++++ .../tensorrt_llm_bls/1/model.py | 137 +++ .../tensorrt_llm_bls/config.pbtxt | 252 ++++++ .../src/tests/test_chat_completions.py | 232 ++++- .../example/src/tests/test_completions.py | 198 ++++- .../example/src/tests/test_observability.py | 67 ++ .../example/src/tests/test_utilities.py | 79 -- .../src/tests/vllm_models/gpt2/1/model.json | 1 - .../llama-3-8b-instruct/1/model.json | 1 + .../config.pbtxt | 0 qa/L0_openai/example/src/utils/triton.py | 167 ++-- 28 files changed, 4599 insertions(+), 268 deletions(-) rename qa/L0_openai/example/src/routers/{utilities.py => observability.py} (100%) create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/1/.tmp create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt create mode 100644 
qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py create mode 100644 qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt create mode 100644 qa/L0_openai/example/src/tests/test_observability.py delete mode 100644 qa/L0_openai/example/src/tests/test_utilities.py delete mode 100644 qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json create mode 100644 qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json rename qa/L0_openai/example/src/tests/vllm_models/{gpt2 => llama-3-8b-instruct}/config.pbtxt (100%) diff --git a/qa/L0_openai/example/Dockerfile b/qa/L0_openai/example/Dockerfile index e6dc560037..64d5637432 100644 --- a/qa/L0_openai/example/Dockerfile +++ b/qa/L0_openai/example/Dockerfile @@ -1,4 +1,4 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/example/main.py index e206d6f724..81b4f8066a 100644 --- a/qa/L0_openai/example/main.py +++ b/qa/L0_openai/example/main.py @@ -2,7 +2,7 @@ import os import uvicorn -from src.api_server import app +from src.api_server import init_app def parse_args(): @@ -48,6 +48,7 @@ def parse_args(): os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) + app = init_app() uvicorn.run( app, host=args.host, diff --git a/qa/L0_openai/example/src/api_server.py b/qa/L0_openai/example/src/api_server.py index 10b740e978..1b7543a4a0 100644 --- a/qa/L0_openai/example/src/api_server.py +++ b/qa/L0_openai/example/src/api_server.py @@ -1,14 +1,12 @@ from __future__ import annotations from contextlib import asynccontextmanager -from typing import Union import tritonserver from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from src.routers import chat_completions, completions, models, utilities +from src.routers import chat_completions, completions, models, observability from src.utils.triton import init_tritonserver -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast def add_cors_middleware(app: FastAPI): @@ -31,11 +29,9 @@ def add_cors_middleware(app: FastAPI): async def lifespan(app: FastAPI): print("Starting FastAPI app lifespan...") # Start the tritonserver on FastAPI app startup - server, model_metadata = init_tritonserver() + server, model_metadatas = init_tritonserver() app.server = server - # TODO: Clean up or refactor this flow to store models for /v1/models endpoints - app.models = {} - app.models[model_metadata.name] = model_metadata + app.models = {metadata.name: metadata for metadata in model_metadatas} yield @@ -43,37 
+39,36 @@ async def lifespan(app: FastAPI): print("Shutting down FastAPI app lifespan...") if app.server: print("Shutting down Triton Inference Server...") - app.server.stop() + try: + app.server.stop() + # Log error, but don't raise on shutdown + except tritonserver.InternalError as e: + print(e) -app = FastAPI( - title="OpenAI API", - description="The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details.", - version="2.0.0", - termsOfService="https://openai.com/policies/terms-of-use", - contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, - license={ - "name": "MIT", - "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", - }, - # TODO: Do we need this? This affects the endpoints used in /docs endpoint. - # servers=[{"url": "https://api.openai.com/v1"}], - lifespan=lifespan, -) +def init_app(): + app = FastAPI( + title="OpenAI API", + description="The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details.", + version="2.0.0", + termsOfService="https://openai.com/policies/terms-of-use", + contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, + license={ + "name": "MIT", + "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, + lifespan=lifespan, + ) + + app.include_router(observability.router) + app.include_router(models.router) + app.include_router(completions.router) + app.include_router(chat_completions.router) -app.include_router(utilities.router) -app.include_router(models.router) -app.include_router(completions.router) -app.include_router(chat_completions.router) + # NOTE: For debugging purposes, should generally be restricted or removed + add_cors_middleware(app) -# NOTE: For debugging purposes, should generally be restricted or removed -add_cors_middleware(app) + # TODO: Add common logger and use logger.debug in place of current print + # statements for debugging purposes. -# TODO: Refactor/remove globals where not necessary -server: tritonserver.Server -model: tritonserver.Model -model_source_name: str -model_create_time: int -backend: str -tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -create_inference_request = None + return app diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/example/src/routers/chat_completions.py index 46d0b2524e..189836ad5b 100644 --- a/qa/L0_openai/example/src/routers/chat_completions.py +++ b/qa/L0_openai/example/src/routers/chat_completions.py @@ -14,7 +14,7 @@ CreateChatCompletionStreamResponse, ObjectType, ) -from src.utils.triton import create_vllm_inference_request, get_output +from src.utils.triton import get_output router = APIRouter() @@ -77,6 +77,9 @@ def create_chat_completion( """ # TODO: Cleanup + print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") + print(f"[DEBUG] Fetching model metadata for {request.model=}") + model_metadatas = raw_request.app.models if not model_metadatas: raise HTTPException(status_code=400, detail="No known models") @@ -85,22 +88,29 @@ def create_chat_completion( if not metadata: raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") - # TODO: python models? default tokenizer? no tokenization OK? 
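    # NOTE (editorial, illustrative only): `request_convert_fn` is the per-backend hook
    # that maps an OpenAI-style request onto Triton inputs. A minimal sketch of what a
    # vLLM-flavored converter could look like is below; the tensor names and the plain
    # dict return value are assumptions, and the real helper lives in src/utils/triton.py:
    #
    #     import json
    #
    #     def example_vllm_request_convert(triton_model, prompt, request):
    #         # triton_model is unused in this sketch but kept to match the call site.
    #         sampling = {
    #             "temperature": request.temperature,
    #             "top_p": request.top_p,
    #             "max_tokens": request.max_tokens,
    #         }
    #         # Drop unset fields so the backend's defaults apply.
    #         sampling = {k: v for k, v in sampling.items() if v is not None}
    #         return {
    #             "text_input": [prompt],
    #             "stream": [bool(request.stream)],
    #             "sampling_parameters": [json.dumps(sampling)],
    #         }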
+ if not metadata.request_convert_fn: + raise HTTPException( + status_code=400, detail=f"Unknown request format for model: {request.model}" + ) + if not metadata.tokenizer: - raise HTTPException(status_code=400, detail="No known tokenizer") + raise HTTPException(status_code=400, detail="Unknown tokenizer") if not metadata.backend: - raise HTTPException(status_code=400, detail="No known backend") + raise HTTPException(status_code=400, detail="Unknown backend") add_generation_prompt_default = True default_role = "assistant" - model = raw_request.app.server.model(request.model) - if request.model != model.name and request.model != metadata.source_name: - raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + triton_model = raw_request.app.server.model(request.model) + if request.model != triton_model.name: + raise HTTPException( + status_code=400, + detail=f"Mismatched model name: {request.model} != {triton_model.name}", + ) if request.n and request.n > 1: - raise HTTPException(status_code=400, detail=f"Only single choice is supported") + raise HTTPException(status_code=400, detail="Only single choice is supported") if request.logit_bias is not None or request.logprobs: raise HTTPException( @@ -123,11 +133,9 @@ def create_chat_completion( request_id = f"cmpl-{uuid.uuid1()}" created = int(time.time()) - # TODO: Associate request function / backend with model metadata - # responses = model.infer(create_inference_request(model, prompt, request)) - print(f"[DEBUG] {model=}") - print(f"[DEBUG] {metadata=}") - responses = model.infer(create_vllm_inference_request(model, prompt, request)) + responses = triton_model.infer( + metadata.request_convert_fn(triton_model, prompt, request) + ) if request.stream: return StreamingResponse( diff --git a/qa/L0_openai/example/src/routers/completions.py b/qa/L0_openai/example/src/routers/completions.py index 02e1bfb730..71954574cd 100644 --- a/qa/L0_openai/example/src/routers/completions.py +++ b/qa/L0_openai/example/src/routers/completions.py @@ -10,7 +10,7 @@ FinishReason, ObjectType, ) -from src.utils.triton import create_vllm_inference_request, get_output +from src.utils.triton import get_output router = APIRouter() @@ -49,22 +49,29 @@ def create_completion( """ if not request.model: - raise Exception("No Model Provided") + raise Exception("Request must provide a valid 'model'") - model = raw_request.app.server.model(request.model) + print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") + print(f"[DEBUG] Fetching model metadata for {request.model=}") + metadata = raw_request.app.models.get(request.model) - # if not not tokenizer or not create_inference_request: - # raise Exception("Unknown Model") + if not metadata: + raise HTTPException( + status_code=400, detail=f"Unknown model metadata for model: {request.model}" + ) + + if not metadata.request_convert_fn: + raise HTTPException( + status_code=400, detail=f"Unknown request format for model: {request.model}" + ) if request.suffix is not None: raise HTTPException(status_code=400, detail="suffix is not currently supported") - if request.model != model.name: + if request.model != metadata.name: raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") if not request.prompt: - # TODO: Needed? 
- # request.prompt = "<|endoftext|>" raise HTTPException(status_code=400, detail="prompt must be non-empty") # Currently only support single string as input @@ -73,6 +80,9 @@ def create_completion( status_code=400, detail="only single string input is supported" ) + if request.n and request.n > 1: + raise HTTPException(status_code=400, detail="Only single choice is supported") + if request.logit_bias is not None or request.logprobs is not None: raise HTTPException( status_code=400, detail="logit bias and log probs not supported" @@ -81,14 +91,13 @@ def create_completion( request_id = f"cmpl-{uuid.uuid1()}" created = int(time.time()) - # TODO: Determine backend, using hard-coded vllm for simplicity - # responses = model.infer(create_inference_request(model, request.prompt, request)) - responses = model.infer( - create_vllm_inference_request(model, request.prompt, request) + triton_model = raw_request.app.server.model(request.model) + responses = triton_model.infer( + metadata.request_convert_fn(triton_model, request.prompt, request) ) if request.stream: return StreamingResponse( - streaming_completion_response(request_id, created, model.name, responses) + streaming_completion_response(request_id, created, metadata.name, responses) ) response = list(responses)[0] text = get_output(response) @@ -105,5 +114,5 @@ def create_completion( system_fingerprint=None, object=ObjectType.text_completion, created=created, - model=model.name, + model=metadata.name, ) diff --git a/qa/L0_openai/example/src/routers/models.py b/qa/L0_openai/example/src/routers/models.py index 716ef0719d..6798a52289 100644 --- a/qa/L0_openai/example/src/routers/models.py +++ b/qa/L0_openai/example/src/routers/models.py @@ -12,13 +12,13 @@ def list_models(request: Request) -> ListModelsResponse: """ Lists the currently available models, and provides basic information about each one such as the owner and availability. """ - models = request.app.models - if not models: + model_metadatas = request.app.models + if not model_metadatas: raise HTTPException(status_code=400, detail="No known models") model_list = [] - for model in models: - metadata = models[model] + for model in model_metadatas: + metadata = model_metadatas[model] if not metadata: raise HTTPException( status_code=400, detail=f"No metadata for model: {model}" @@ -41,15 +41,14 @@ def retrieve_model(request: Request, model_name: str) -> Model: """ Retrieves a model instance, providing basic information about the model such as the owner and permissioning. """ - models = request.app.models - if not models: + model_metadatas = request.app.models + if not model_metadatas: raise HTTPException(status_code=400, detail="No known models") - model = models.get(model_name) + model = model_metadatas.get(model_name) if not model: raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") - # TODO: Do we want to accept both triton name or source name interchangeably? 
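    # NOTE (editorial, illustrative only): with the source-name alias removed, lookups go
    # strictly by the Triton model name, and a successful retrieval returns an OpenAI-style
    # model object, roughly (field values are placeholders):
    #
    #     {"id": "<triton model name>", "object": "model",
    #      "created": <unix timestamp>, "owned_by": "<owner>"}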
if model_name == model.name: return Model( id=model.name, @@ -58,12 +57,4 @@ def retrieve_model(request: Request, model_name: str) -> Model: owned_by=OWNED_BY, ) - if model_name == model.source_name: - return Model( - id=model.source_name, - created=model.create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ) - raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") diff --git a/qa/L0_openai/example/src/routers/utilities.py b/qa/L0_openai/example/src/routers/observability.py similarity index 100% rename from qa/L0_openai/example/src/routers/utilities.py rename to qa/L0_openai/example/src/routers/observability.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/1/.tmp b/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/1/.tmp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt b/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt new file mode 100644 index 0000000000..b82990446d --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt @@ -0,0 +1,470 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
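# NOTE (editorial): this ensemble chains three models into one text-in/text-out pipeline:
#   preprocessing   - tokenizes "text_input" (plus optional bad/stop words and embedding bias)
#   tensorrt_llm    - runs generation on the token IDs with the requested sampling settings
#   postprocessing  - detokenizes the output IDs and passes through optional log probs/logits
# The input_map/output_map entries below wire the OpenAI-facing tensor names to each step.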
+ +name: "ensemble" +platform: "ensemble" +max_batch_size: 64 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "max_tokens" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocessing" + model_version: -1 + input_map { + key: "QUERY" + value: "text_input" + } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } + input_map { + key: "REQUEST_OUTPUT_LEN" + value: "max_tokens" + } + input_map { + key: "BAD_WORDS_DICT" + value: "bad_words" + } + input_map { + key: "STOP_WORDS_DICT" + value: "stop_words" + } + input_map { + key: "EMBEDDING_BIAS_WORDS" + value: "embedding_bias_words" + } + input_map { + key: "EMBEDDING_BIAS_WEIGHTS" + value: "embedding_bias_weights" + } + input_map { + key: "END_ID" + value: "end_id" + } + input_map { + key: "PAD_ID" + value: "pad_id" + } + output_map { + key: "REQUEST_INPUT_LEN" + value: "_REQUEST_INPUT_LEN" + } + output_map { + key: "INPUT_ID" + value: "_INPUT_ID" + } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + 
output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } + output_map { + key: "REQUEST_OUTPUT_LEN" + value: "_REQUEST_OUTPUT_LEN" + } + output_map { + key: "STOP_WORDS_IDS" + value: "_STOP_WORDS_IDS" + } + output_map { + key: "BAD_WORDS_IDS" + value: "_BAD_WORDS_IDS" + } + output_map { + key: "EMBEDDING_BIAS" + value: "_EMBEDDING_BIAS" + } + output_map { + key: "OUT_END_ID" + value: "_PREPROCESSOR_END_ID" + } + output_map { + key: "OUT_PAD_ID" + value: "_PREPROCESSOR_PAD_ID" + } + }, + { + model_name: "tensorrt_llm" + model_version: -1 + input_map { + key: "input_ids" + value: "_INPUT_ID" + } + input_map { + key: "decoder_input_ids" + value: "_DECODER_INPUT_ID" + } + input_map { + key: "input_lengths" + value: "_REQUEST_INPUT_LEN" + } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } + input_map { + key: "request_output_len" + value: "_REQUEST_OUTPUT_LEN" + } + input_map { + key: "end_id" + value: "_PREPROCESSOR_END_ID" + } + input_map { + key: "pad_id" + value: "_PREPROCESSOR_PAD_ID" + } + input_map { + key: "embedding_bias" + value: "_EMBEDDING_BIAS" + } + input_map { + key: "runtime_top_k" + value: "top_k" + } + input_map { + key: "runtime_top_p" + value: "top_p" + } + input_map { + key: "temperature" + value: "temperature" + } + input_map { + key: "len_penalty" + value: "length_penalty" + } + input_map { + key: "repetition_penalty" + value: "repetition_penalty" + } + input_map { + key: "min_length" + value: "min_length" + } + input_map { + key: "presence_penalty" + value: "presence_penalty" + } + input_map { + key: "frequency_penalty" + value: "frequency_penalty" + } + input_map { + key: "random_seed" + value: "random_seed" + } + input_map { + key: "return_log_probs" + value: "return_log_probs" + } + input_map { + key: "return_context_logits" + value: "return_context_logits" + } + input_map { + key: "return_generation_logits" + value: "return_generation_logits" + } + input_map { + key: "beam_width" + value: "beam_width" + } + input_map { + key: "streaming" + value: "stream" + } + input_map { + key: "prompt_embedding_table" + value: "prompt_embedding_table" + } + input_map { + key: "prompt_vocab_size" + value: "prompt_vocab_size" + } + input_map { + key: "stop_words_list" + value: "_STOP_WORDS_IDS" + } + input_map { + key: "bad_words_list" + value: "_BAD_WORDS_IDS" + } + output_map { + key: "output_ids" + value: "_TOKENS_BATCH" + } + output_map { + key: "sequence_length" + value: "_SEQUENCE_LENGTH" + }, + output_map { + key: "cum_log_probs" + value: "_CUM_LOG_PROBS" + } + output_map { + key: "output_log_probs" + value: "_OUTPUT_LOG_PROBS" + }, + output_map { + key: "context_logits" + value: "_CONTEXT_LOGITS" + }, + output_map { + key: "generation_logits" + value: "_GENERATION_LOGITS" + } + }, + { + model_name: "postprocessing" + model_version: -1 + input_map { + key: "TOKENS_BATCH" + value: "_TOKENS_BATCH" + } + input_map { + key: "CUM_LOG_PROBS" + value: "_CUM_LOG_PROBS" + } + input_map { + key: "OUTPUT_LOG_PROBS" + value: "_OUTPUT_LOG_PROBS" + } + input_map { + key: "CONTEXT_LOGITS" + value: "_CONTEXT_LOGITS" + } + input_map { + key: "GENERATION_LOGITS" + value: "_GENERATION_LOGITS" + } + input_map { + key: "SEQUENCE_LENGTH" + value: "_SEQUENCE_LENGTH" + } + output_map { + key: "OUTPUT" + value: "text_output" + } + output_map { + key: "OUT_OUTPUT_LOG_PROBS" + value: "output_log_probs" + } + output_map { + key: "OUT_CUM_LOG_PROBS" + value: "cum_log_probs" + } + output_map { + key: "OUT_CONTEXT_LOGITS" + value: "context_logits" + } + 
output_map { + key: "OUT_GENERATION_LOGITS" + value: "generation_logits" + } + } + ] +} diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py new file mode 100644 index 0000000000..0812e19b3e --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py @@ -0,0 +1,246 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] + + skip_special_tokens = model_config["parameters"].get("skip_special_tokens") + if skip_special_tokens is not None: + skip_special_tokens_str = skip_special_tokens["string_value"].lower() + if skip_special_tokens_str in [ + "true", + "false", + "1", + "0", + "t", + "f", + "y", + "n", + "yes", + "no", + ]: + self.skip_special_tokens = skip_special_tokens_str in [ + "true", + "1", + "t", + "y", + "yes", + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + ) + self.skip_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." + ) + self.skip_special_tokens = True + + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True + ) + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. 
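        # NOTE (editorial): per request, the loop below reads TOKENS_BATCH and
        # SEQUENCE_LENGTH, decodes each beam with the HF tokenizer (honoring
        # skip_special_tokens), and forwards the optional CUM_LOG_PROBS /
        # OUTPUT_LOG_PROBS / CONTEXT_LOGITS / GENERATION_LOGITS tensors, falling
        # back to zero-filled placeholders when they were not provided.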
+ for idx, request in enumerate(requests): + # Get input tensors + tokens_batch = pb_utils.get_input_tensor_by_name( + request, "TOKENS_BATCH" + ).as_numpy() + + # Get sequence length + sequence_lengths = pb_utils.get_input_tensor_by_name( + request, "SEQUENCE_LENGTH" + ).as_numpy() + + # Get cum log probs + cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS") + + # Get sequence length + output_log_probs = pb_utils.get_input_tensor_by_name( + request, "OUTPUT_LOG_PROBS" + ) + + # Get context logits + context_logits = pb_utils.get_input_tensor_by_name( + request, "CONTEXT_LOGITS" + ) + + # Get generation logits + generation_logits = pb_utils.get_input_tensor_by_name( + request, "GENERATION_LOGITS" + ) + + # Reshape Input + # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) + # tokens_batch = tokens_batch.T + + # Postprocessing output data. + outputs = self._postprocessing(tokens_batch, sequence_lengths) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array(outputs).astype(self.output_dtype) + ) + + outputs = [] + outputs.append(output_tensor) + + if cum_log_probs: + out_cum_log_probs = pb_utils.Tensor( + "OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy() + ) + outputs.append(out_cum_log_probs) + else: + out_cum_log_probs = pb_utils.Tensor( + "OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32) + ) + outputs.append(out_cum_log_probs) + + if output_log_probs: + out_output_log_probs = pb_utils.Tensor( + "OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy() + ) + outputs.append(out_output_log_probs) + else: + out_output_log_probs = pb_utils.Tensor( + "OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32) + ) + outputs.append(out_output_log_probs) + + if context_logits: + out_context_logits = pb_utils.Tensor( + "OUT_CONTEXT_LOGITS", context_logits.as_numpy() + ) + outputs.append(out_context_logits) + else: + out_context_logits = pb_utils.Tensor( + "OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32) + ) + outputs.append(out_context_logits) + + if generation_logits: + out_generation_logits = pb_utils.Tensor( + "OUT_GENERATION_LOGITS", generation_logits.as_numpy() + ) + outputs.append(out_generation_logits) + else: + out_generation_logits = pb_utils.Tensor( + "OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32) + ) + outputs.append(out_generation_logits) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse(output_tensors=outputs) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print("Cleaning up...") + + def _postprocessing(self, tokens_batch, sequence_lengths): + outputs = [] + for batch_idx, beam_tokens in enumerate(tokens_batch): + for beam_idx, tokens in enumerate(beam_tokens): + seq_len = sequence_lengths[batch_idx][beam_idx] + output = self.tokenizer.decode( + tokens[:seq_len], skip_special_tokens=self.skip_special_tokens + ) + outputs.append(output.encode("utf8")) + return outputs diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt b/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt new file mode 100644 index 0000000000..dee851662d --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt @@ -0,0 +1,113 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "postprocessing" +backend: "python" +max_batch_size: 256 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + }, + { + name: "SEQUENCE_LENGTH" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "CUM_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "OUTPUT_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + }, + { + name: "CONTEXT_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + }, + { + name: "GENERATION_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + optional: true + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "OUT_CUM_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "OUT_OUTPUT_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "OUT_CONTEXT_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "OUT_GENERATION_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" + } +} + +parameters { + key: "skip_special_tokens" + value: { + string_value: "${skip_special_tokens}" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py new file mode 100644 index 0000000000..eb4487c803 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py @@ -0,0 +1,418 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from typing import List + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
+ """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] + + add_special_tokens = model_config["parameters"].get("add_special_tokens") + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens["string_value"].lower() + if add_special_tokens_str in [ + "true", + "false", + "1", + "0", + "t", + "f", + "y", + "n", + "yes", + "no", + ]: + self.add_special_tokens = add_special_tokens_str in [ + "true", + "1", + "t", + "y", + "yes", + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." + ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." + ) + self.add_special_tokens = True + + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True + ) + if isinstance(self.tokenizer, T5Tokenizer): + self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.tokenizer_end_id = self.tokenizer.encode( + self.tokenizer.eos_token, add_special_tokens=False + )[0] + self.tokenizer_pad_id = self.tokenizer.encode( + self.tokenizer.pad_token, add_special_tokens=False + )[0] + + # Parse model output configs and convert Triton types to numpy types + output_names = [ + "INPUT_ID", + "DECODER_INPUT_ID", + "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + "OUT_END_ID", + "OUT_PAD_ID", + ] + input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] + for input_name in input_names: + setattr( + self, + input_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_input_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) + + for output_name in output_names: + setattr( + self, + output_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name(model_config, output_name)[ + "data_type" + ] + ), + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. 
+ Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + logger = pb_utils.Logger + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() + decoder_query = pb_utils.get_input_tensor_by_name(request, "DECODER_QUERY") + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + + batch_dim = query.shape[0] + if batch_dim != 1: + err_str = ( + "Inflight batching backend expects requests with batch size of 1." + ) + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], error=pb_utils.TritonError(err_str) + ) + ) + continue + + request_output_len = pb_utils.get_input_tensor_by_name( + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() + + bad_words_dict = pb_utils.get_input_tensor_by_name( + request, "BAD_WORDS_DICT" + ) + if bad_words_dict is not None: + bad_words_dict = bad_words_dict.as_numpy() + + stop_words_dict = pb_utils.get_input_tensor_by_name( + request, "STOP_WORDS_DICT" + ) + if stop_words_dict is not None: + stop_words_dict = stop_words_dict.as_numpy() + + embedding_bias_words = pb_utils.get_input_tensor_by_name( + request, "EMBEDDING_BIAS_WORDS" + ) + if embedding_bias_words is not None: + embedding_bias_words = embedding_bias_words.as_numpy() + + embedding_bias_weights = pb_utils.get_input_tensor_by_name( + request, "EMBEDDING_BIAS_WEIGHTS" + ) + if embedding_bias_weights is not None: + embedding_bias_weights = embedding_bias_weights.as_numpy() + + # Take the end_id from the input tensors + # If not specified, use tokenizer to get end_id + end_id = pb_utils.get_input_tensor_by_name(request, "END_ID") + if end_id is not None: + end_id = end_id.as_numpy() + else: + end_id = [[self.tokenizer_end_id]] + + # Take the pad_id from the input tensors + # If not specified, use tokenizer to get pad_id + pad_id = pb_utils.get_input_tensor_by_name(request, "PAD_ID") + if pad_id is not None: + pad_id = pad_id.as_numpy() + else: + pad_id = [[self.tokenizer_pad_id]] + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query + ) + else: + decoder_input_id = pad_id * np.ones((1, 1), np.int32) + request_decoder_input_len = 1 * np.ones((1, 1), np.int32) + + bad_words = self._to_word_list_format(bad_words_dict) + stop_words = self._to_word_list_format(stop_words_dict) + + embedding_bias = self._get_embedding_bias( + embedding_bias_words, + embedding_bias_weights, + self.embedding_bias_weights_dtype, + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
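+            # Shapes at this point, for the single-item batch of this request:
+            #   input_id / decoder_input_id:     [1, num_tokens] (int)
+            #   request_input_len (and decoder): [1, 1]
+            #   bad_words / stop_words:          [1, 2, max_words_len] from _to_word_list_format()
+            #   embedding_bias:                  [1, vocab_size], or [1, 0] when no bias was given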
+ input_id_tensor = pb_utils.Tensor( + "INPUT_ID", input_id.astype(self.input_id_dtype) + ) + request_input_len_tensor = pb_utils.Tensor( + "REQUEST_INPUT_LEN", + request_input_len.astype(self.request_input_len_dtype), + ) + decoder_input_id_tensor = pb_utils.Tensor( + "DECODER_INPUT_ID", decoder_input_id.astype(self.decoder_input_id_dtype) + ) + request_decoder_input_len_tensor = pb_utils.Tensor( + "REQUEST_DECODER_INPUT_LEN", + request_decoder_input_len.astype(self.request_decoder_input_len_dtype), + ) + request_output_len_tensor = pb_utils.Tensor( + "REQUEST_OUTPUT_LEN", request_output_len + ) + bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) + stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) + embedding_bias_tensor = pb_utils.Tensor("EMBEDDING_BIAS", embedding_bias) + end_id_tensor = pb_utils.Tensor( + "OUT_END_ID", np.array(end_id, dtype=np.int32) + ) + pad_id_tensor = pb_utils.Tensor( + "OUT_PAD_ID", np.array(pad_id, dtype=np.int32) + ) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + decoder_input_id_tensor, + bad_words_ids_tensor, + stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, + embedding_bias_tensor, + end_id_tensor, + pad_id_tensor, + ] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") + + def _create_request(self, query): + """ + query : batch string (2D numpy array) + """ + if isinstance(self.tokenizer, T5Tokenizer): + start_ids = [ + np.array( + [self.tokenizer_bos_id] + + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + else: + start_ids = [ + np.array( + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) + + max_len = 0 + for seq in start_ids: + max_len = max(max_len, seq.shape[0]) + start_ids = np.stack( + [ + np.pad( + seq, + (0, max_len - seq.shape[0]), + "constant", + constant_values=(0, self.tokenizer_pad_id), + ) + for seq in start_ids + ] + ) + + return start_ids, start_lengths + + def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + """ + word_lists format: + len(word_lists) == batch_size + word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
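+
+        Returns a [batch_size, 2, max_len] int32 array: for each batch item,
+        row 0 holds the concatenated token ids of all words and row 1 holds
+        the cumulative end offset of each word, padded with -1.
+
+        Illustrative example (token ids below are made up, not real tokenizer
+        output): for word_lists = [["lorem", "lorem ipsum"]], if "lorem"
+        encodes to [1001] and "lorem ipsum" to [1001, 1002], the result is
+            [[[1001, 1001, 1002],
+              [   1,    3,   -1]]]   # shape [1, 2, 3]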
+ """ + assert self.tokenizer != None, "need to set tokenizer" + + if word_lists is None: + # Return an empty array of shape (1,2,0) + return np.empty([1, 2, 0], dtype="int32") + + flat_ids = [] + offsets = [] + for word_list in word_lists: + item_flat_ids = [] + item_offsets = [] + + for word in word_list: + if isinstance(word, bytes): + word = word.decode() + + ids = self.tokenizer.encode(word, add_special_tokens=False) + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) + + def _get_embedding_bias( + self, embedding_bias_words, embedding_bias_weights, bias_dtype + ): + assert self.tokenizer != None, "need to set tokenizer" + + if embedding_bias_words is None or embedding_bias_weights is None: + return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + + batch_embedding_bias = [] + for words, weights in zip(embedding_bias_words, embedding_bias_weights): + vocab_size = self.tokenizer.vocab_size + embedding_bias = [0.0] * vocab_size + + assert len(words) == len( + weights + ), "Embedding bias words must have same dimension as embedding bias weights" + + for word, weight in zip(words, weights): + if isinstance(word, bytes): + word = word.decode() + ids = self.tokenizer.encode(word) + + if len(ids) == 0: + continue + + for id in ids: + embedding_bias[id] += weight + + batch_embedding_bias.append(np.array(embedding_bias)) + + return np.array(batch_embedding_bias, dtype=bias_dtype) diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt b/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt new file mode 100644 index 0000000000..a262cf6983 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt @@ -0,0 +1,156 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "preprocessing" +backend: "python" +max_batch_size: 256 +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "BAD_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "STOP_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "EMBEDDING_BIAS_WORDS" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "EMBEDDING_BIAS_WEIGHTS" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "END_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + }, + { + name: "PAD_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "BAD_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "STOP_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "EMBEDDING_BIAS" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "OUT_END_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "OUT_PAD_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" + } +} + +parameters { + key: "add_special_tokens" + value: { + string_value: "${add_special_tokens}" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py new file mode 100644 index 0000000000..3425a20f57 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py @@ -0,0 +1,797 @@ +import datetime +import json +import os +import time +from threading import Lock, Thread + +import numpy as np +import tensorrt_llm.bindings.executor as trtllm +import triton_python_backend_utils as pb_utils +from torch import from_numpy + + +def get_input_tensor_by_name(request, name): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + if name == "temperature": + print(f"Tensor for {name} is None!") + return None + return tensor.as_numpy() + + +def get_input_scalar_by_name(request, name): + tensor = get_input_tensor_by_name(request, name) + if tensor is None: + 
if name == "temperature": + print(f"Scalar for {name} is None!") + return None + if tensor.size != 1: + raise pb_utils.TritonModelException(f"Expected a single value for {name}") + return tensor.item() + + +def read_parameter_as_type(value, name, pytype=str): + if value == "": + return None + if value.startswith("${") and value.endswith("}"): + return None + if pytype is bool: + return value.lower() in ["1", "true"] + try: + result = pytype(value) + return result + except: + pb_utils.Logger.log_warning( + f"Could not read parameter '{name}' with value '{value}', will use default." + ) + return None + + +def get_parameter(model_config, name, pytype=str): + if name not in model_config["parameters"]: + return None + return read_parameter_as_type( + model_config["parameters"][name]["string_value"], name, pytype + ) + + +def convert_word_list(word_list): + if word_list is None: + return None + word_list = word_list.tolist() + if len(word_list) == 0 or len(word_list[0]) != 2: + raise pb_utils.TritonModelException(f"Invalid format for word list.") + words, indices = word_list[0] + result = [] + current_index = 0 + for i in indices: + if i == -1: + continue + if i > len(words): + raise pb_utils.TritonModelException(f"Invalid format for word list.") + current_word = [] + while current_index < i: + current_word.append(words[current_index]) + current_index += 1 + result.append(current_word) + return result + + +def parse_medusa_choices(medusa_choices): + if medusa_choices is None: + return None + try: + result = json.loads( + "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]" + ) + assert isinstance(result, list) and len(result) > 0 + assert all([isinstance(x, list) for x in result]) + assert all([isinstance(y, int) for x in result for y in x]) + except Exception: + raise pb_utils.TritonModelException("Invalid format for medusa_choices") + return result + + +def get_sampling_config_from_request(request): + kwargs = {} + kwargs["beam_width"] = get_input_scalar_by_name(request, "beam_width") or 1 + kwargs["top_k"] = get_input_scalar_by_name(request, "runtime_top_k") + kwargs["top_p"] = get_input_scalar_by_name(request, "runtime_top_p") + kwargs["top_p"] = ( + None if kwargs["top_p"] is None or kwargs["top_p"] <= 0 else kwargs["top_p"] + ) + kwargs["random_seed"] = get_input_scalar_by_name(request, "random_seed") + kwargs["temperature"] = get_input_scalar_by_name(request, "temperature") + # print(f"=========== [DEBUG] [trtllm python runtime model.py] {kwargs['temperature']=} ==========") + kwargs["min_length"] = get_input_scalar_by_name(request, "min_length") + kwargs["repetition_penalty"] = get_input_scalar_by_name( + request, "repetition_penalty" + ) + kwargs["presence_penalty"] = get_input_scalar_by_name(request, "presence_penalty") + kwargs["frequency_penalty"] = get_input_scalar_by_name(request, "frequency_penalty") + kwargs["length_penalty"] = get_input_scalar_by_name(request, "len_penalty") + kwargs["top_p_min"] = get_input_scalar_by_name(request, "runtime_top_p_min") + kwargs["top_p_reset_ids"] = get_input_scalar_by_name( + request, "runtime_top_p_reset_ids" + ) + kwargs["top_p_decay"] = get_input_scalar_by_name(request, "runtime_top_p_decay") + kwargs["beam_search_diversity_rate"] = get_input_scalar_by_name( + request, "beam_search_diversity_rate" + ) + kwargs["early_stopping"] = get_input_scalar_by_name(request, "early_stopping") + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.SamplingConfig(**kwargs) + + +def get_output_config_from_request(request, 
exclude_input_from_output): + kwargs = {} + kwargs["return_log_probs"] = get_input_scalar_by_name(request, "return_log_probs") + kwargs["return_context_logits"] = get_input_scalar_by_name( + request, "return_context_logits" + ) + kwargs["return_generation_logits"] = get_input_scalar_by_name( + request, "return_generation_logits" + ) + kwargs["exclude_input_from_output"] = exclude_input_from_output + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.OutputConfig(**kwargs) + + +def get_external_draft_tokens_config_from_request(request): + kwargs = {} + draft_input_ids = get_input_tensor_by_name(request, "draft_input_ids") + if draft_input_ids is not None: + kwargs["tokens"] = draft_input_ids.tolist() + draft_logits = get_input_tensor_by_name(request, "draft_logits") + if draft_logits is not None: + kwargs["logits"] = from_numpy(draft_logits) + kwargs["acceptance_threshold"] = get_input_scalar_by_name( + request, "draft_acceptance_threshold" + ) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.ExternalDraftTokensConfig(**kwargs) + return None + + +def get_prompt_tuning_config_from_request(request): + # prompt_vocab_size is unused by executor. + kwargs = {} + prompt_embedding_table = get_input_tensor_by_name(request, "prompt_embedding_table") + if prompt_embedding_table is not None: + kwargs["embedding_table"] = from_numpy(prompt_embedding_table) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.PromptTuningConfig(**kwargs) + return None + + +def get_lora_config_from_request(request): + kwargs = {} + kwargs["task_id"] = get_input_scalar_by_name(request, "lora_task_id") + lora_weights = get_input_tensor_by_name(request, "lora_weights") + if lora_weights is not None: + kwargs["weights"] = from_numpy(lora_weights) + lora_config = get_input_tensor_by_name(request, "lora_config") + if lora_config is not None: + kwargs["config"] = from_numpy(lora_config) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.LoraConfig(**kwargs) + return None + + +def convert_request(request, exclude_input_from_output, decoupled): + inputs = {} + input_token_ids = get_input_tensor_by_name(request, "input_ids") + if input_token_ids is None: + raise pb_utils.TritonModelException("A value is required for input_ids") + input_token_ids = input_token_ids.tolist() + if len(input_token_ids) == 0: + raise pb_utils.TritonModelException(f"Invalid format for input_ids") + inputs["input_token_ids"] = input_token_ids[0] + # input_lengths is not not used by executor. + inputs["max_new_tokens"] = get_input_scalar_by_name(request, "request_output_len") + if inputs["max_new_tokens"] is None: + raise pb_utils.TritonModelException( + "A value is required for request_output_len" + ) + inputs["streaming"] = get_input_scalar_by_name(request, "streaming") + if inputs["streaming"] and not decoupled: + raise pb_utils.TritonModelException( + "Streaming is only supported in decoupled mode." 
+ ) + inputs["end_id"] = get_input_scalar_by_name(request, "end_id") + inputs["pad_id"] = get_input_scalar_by_name(request, "pad_id") + inputs["stop_words"] = convert_word_list( + get_input_tensor_by_name(request, "stop_words_list") + ) + inputs["bad_words"] = convert_word_list( + get_input_tensor_by_name(request, "bad_words_list") + ) + embedding_bias = get_input_tensor_by_name(request, "embedding_bias") + if embedding_bias is not None and embedding_bias.size != 0: + inputs["embedding_bias"] = from_numpy(embedding_bias).squeeze() + + sampling_config = get_sampling_config_from_request(request) + output_config = get_output_config_from_request(request, exclude_input_from_output) + external_draft_tokens_config = get_external_draft_tokens_config_from_request( + request + ) + prompt_tuning_config = get_prompt_tuning_config_from_request(request) + lora_config = get_lora_config_from_request(request) + + return trtllm.Request( + **inputs, + sampling_config=sampling_config, + output_config=output_config, + external_draft_tokens_config=external_draft_tokens_config, + prompt_tuning_config=prompt_tuning_config, + lora_config=lora_config, + ) + + +def convert_response(response): + if response.has_error(): + return ( + pb_utils.InferenceResponse( + output_tensors=[], error=pb_utils.TritonError(response.error_msg) + ), + True, + ) + result = response.result + beam_lengths = np.expand_dims( + np.array([len(beam) for beam in result.output_token_ids], np.int32), 0 + ) + max_beam_length = max([len(beam) for beam in result.output_token_ids]) + output_ids = np.full( + (1, len(result.output_token_ids), max_beam_length), -1, np.int32 + ) + for idx, beam in enumerate(result.output_token_ids): + output_ids[0, idx, : len(beam)] = beam + output_tensors = [ + pb_utils.Tensor("output_ids", output_ids), + pb_utils.Tensor("sequence_length", beam_lengths), + ] + output_tensors.append( + pb_utils.Tensor( + "cum_log_probs", + np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) + if result.cum_log_probs is not None + else np.zeros((1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "output_log_probs", + np.expand_dims(np.array(result.log_probs, np.float32), 0) + if result.log_probs is not None + else np.zeros((1, 1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "context_logits", + np.expand_dims(np.array(result.context_logits, np.float32), 0) + if result.context_logits is not None + else np.zeros((1, 1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "generation_logits", + np.expand_dims(np.array(result.generation_logits, np.float32), 0) + if result.generation_logits is not None + else np.zeros((1, 1, 1, 1), np.float32), + ) + ) + return pb_utils.InferenceResponse(output_tensors), result.is_final + + +def convert_scheduler_policy(batch_scheduler_policy: str): + if batch_scheduler_policy.lower() == "max_utilization": + return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION + elif batch_scheduler_policy.lower() == "guaranteed_no_evict": + return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT + raise pb_utils.TritonModelException( + f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." 
+ ) + + +def convert_batching_type(gpt_model_type: str): + if gpt_model_type is None: + return None + if ( + gpt_model_type.lower() == "inflight_fused_batching" + or gpt_model_type.lower() == "inflight_batching" + ): + return trtllm.BatchingType.INFLIGHT + elif gpt_model_type.lower() == "v1": + return trtllm.BatchingType.STATIC + raise pb_utils.TritonModelException( + f"gpt_model_type value of '{gpt_model_type}' is not supported." + ) + + +def convert_decoding_mode(decoding_mode: str): + if decoding_mode is None: + return None + elif decoding_mode == "auto": + return trtllm.DecodingMode.Auto() + elif decoding_mode == "top_k": + return trtllm.DecodingMode.TopK() + elif decoding_mode == "top_p": + return trtllm.DecodingMode.TopP() + elif decoding_mode == "top_k_top_p": + return trtllm.DecodingMode.TopKTopP() + elif decoding_mode == "beam_search": + return trtllm.DecodingMode.BeamSearch() + elif decoding_mode == "medusa": + return trtllm.DecodingMode.Medusa() + raise pb_utils.TritonModelException( + f"decoding_mode value of '{decoding_mode}' is not supported." + ) + + +def convert_timestamp_to_seconds(timestamp: str): + return int(datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp()) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def get_scheduler_config(self, model_config): + batch_scheduler_policy = get_parameter(model_config, "batch_scheduler_policy") + if batch_scheduler_policy is None: + return trtllm.SchedulerConfig() + return trtllm.SchedulerConfig(convert_scheduler_policy(batch_scheduler_policy)) + + def get_kv_cache_config(self, model_config): + kwargs = { + "enable_block_reuse": get_parameter( + model_config, "enable_kv_cache_reuse", bool + ), + "max_tokens": get_parameter( + model_config, "max_tokens_in_paged_kv_cache", int + ), + "sink_token_length": get_parameter(model_config, "sink_token_length", int), + "max_attention_window": get_parameter( + model_config, "max_attention_window_size", int + ), + "free_gpu_memory_fraction": get_parameter( + model_config, "kv_cache_free_gpu_mem_fraction", float + ), + "host_cache_size": get_parameter( + model_config, "kv_cache_host_memory_bytes", int + ), + "onboard_blocks": get_parameter( + model_config, "kv_cache_onboard_blocks", bool + ), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.KvCacheConfig(**kwargs) + + def get_parallel_config(self, model_config): + kwargs = {} + gpu_device_ids = get_parameter(model_config, "gpu_device_ids") + if gpu_device_ids: + kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] + self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", "0") == "1" + if self.use_orchestrator_mode: + kwargs["communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR + worker_path = get_parameter(model_config, "worker_path") + if worker_path is not None: + raise pb_utils.TritonModelException( + "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable." 
+ ) + executor_worker_path = get_parameter(model_config, "executor_worker_path") + kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( + True, executor_worker_path + ) + if len(kwargs) > 0: + return trtllm.ParallelConfig(**kwargs) + return None + + def get_peft_cache_config(self, model_config): + kwargs = { + "optimal_adapter_size": get_parameter( + model_config, "lora_cache_optimal_adapter_size", int + ), + "max_adapter_size": get_parameter( + model_config, "lora_cache_max_adapter_size", int + ), + "device_cache_percent": get_parameter( + model_config, "lora_cache_gpu_memory_fraction", float + ), + "host_cache_size": get_parameter( + model_config, "lora_cache_host_memory_bytes", int + ), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.PeftCacheConfig(**kwargs) + + def get_decoding_config(self, model_config): + kwargs = { + "medusa_choices": parse_medusa_choices( + get_parameter(model_config, "medusa_choices") + ), + "decoding_mode": convert_decoding_mode( + get_parameter(model_config, "decoding_mode") + ), + } + print(kwargs) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.DecodingConfig(**kwargs) + + def get_executor_config(self, model_config): + kwargs = { + "max_beam_width": get_parameter(model_config, "max_beam_width", int), + "scheduler_config": self.get_scheduler_config(model_config), + "kv_cache_config": self.get_kv_cache_config(model_config), + "enable_chunked_context": get_parameter( + model_config, "enable_chunked_context", bool + ), + "normalize_log_probs": get_parameter( + model_config, "normalize_log_probs", bool + ), + "batching_type": convert_batching_type( + get_parameter(model_config, "gpt_model_type") + ), + "parallel_config": self.get_parallel_config(model_config), + "peft_cache_config": self.get_peft_cache_config(model_config), + "decoding_config": self.get_decoding_config(model_config), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExecutorConfig(**kwargs) + + def create_metrics(self, model: str, version: str, is_v1_model: bool): + self.request_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_request_metrics", + description="TRT LLM request metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.runtime_memory_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_runtime_memory_metrics", + description="TRT LLM runtime memory metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.kv_cache_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_kv_cache_block_metrics", + description="TRT LLM KV cache block metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + model_type = "v1" if is_v1_model else "inflight_batcher" + self.model_type_metric_family = pb_utils.MetricFamily( + name=f"nv_trt_llm_{model_type}_metrics", + description=f"TRT LLM {model_type}-specific metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.general_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_general_metrics", + description="General TRT LLM metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + common_labels = {"model": model, "version": version} + self.all_metrics = { + # Request metrics + "num_active_requests": self.request_metric_family.Metric( + labels={"request_type": "active", **common_labels} + ), + "max_num_active_requests": self.request_metric_family.Metric( + labels={"request_type": "max", **common_labels} + ), + "num_scheduled_requests": self.request_metric_family.Metric( + labels={"request_type": "scheduled", **common_labels} + ), + 
"num_context_requests": self.request_metric_family.Metric( + labels={"request_type": "context", **common_labels} + ), + # Runtime metrics + "cpu_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "cpu", **common_labels} + ), + "gpu_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "gpu", **common_labels} + ), + "pinned_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "pinned", **common_labels} + ), + # KV cache metrics + "max_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "max", **common_labels} + ), + "free_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "free", **common_labels} + ), + "used_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "used", **common_labels} + ), + "tokens_per_block": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "tokens_per", **common_labels} + ), + # General metrics + "timestamp": self.general_metric_family.Metric( + labels={"general_type": "timestamp", **common_labels} + ), + "iter": self.general_metric_family.Metric( + labels={"general_type": "iteration_counter", **common_labels} + ), + } + if is_v1_model: + self.all_metrics.update( + { + "num_ctx_tokens": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "total_context_tokens", + **common_labels, + } + ), + "num_gen_tokens": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "total_generation_tokens", + **common_labels, + } + ), + "empty_gen_slots": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "empty_generation_slots", + **common_labels, + } + ), + } + ) + else: + self.all_metrics.update( + { + "num_ctx_tokens": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "total_context_tokens", + **common_labels, + } + ), + "num_gen_requests": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "generation_requests", + **common_labels, + } + ), + "micro_batch_id": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "micro_batch_id", + **common_labels, + } + ), + "num_paused_requests": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "paused_requests", + **common_labels, + } + ), + } + ) + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + model_config = json.loads(args["model_config"]) + gpt_model_path = get_parameter(model_config, "gpt_model_path") + if get_parameter(model_config, "enable_trt_overlap", bool): + raise pb_utils.TritonModelException( + f"enable_trt_overlap=true is not supported." 
+ ) + self.exclude_input_from_output = get_parameter( + model_config, "exclude_input_in_output", bool + ) + executor_config = self.get_executor_config(model_config) + self.executor = trtllm.Executor( + gpt_model_path, trtllm.ModelType.DECODER_ONLY, executor_config + ) + self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) + self.cancellation_check_period_ms = ( + get_parameter(model_config, "cancellation_check_period_ms", int) or 100 + ) + self.stats_check_period_ms = ( + get_parameter(model_config, "stats_check_period_ms", int) or 100 + ) + + if not self.decoupled: + raise pb_utils.TritonModelException( + "Please enable decoupled transaction policy in the model configuration to serve this model" + ) + + self.create_metrics( + args["model_name"], + args["model_version"], + is_v1_model=executor_config.batching_type == trtllm.BatchingType.STATIC, + ) + self.triton_id_to_req_id = {} + self.req_id_to_response_sender = {} + self.lock = Lock() + self.running = False + self.awaiter_thread = Thread(target=self.awaiter_loop) + self.cancellation_thread = Thread(target=self.cancellation_loop) + self.metrics_thread = Thread(target=self.metrics_loop) + if self.executor.can_enqueue_requests(): + self.running = True + self.awaiter_thread.start() + self.cancellation_thread.start() + self.metrics_thread.start() + else: + # In leader mode, worker ranks will wait here until leader is done. + self.executor.shutdown() + + def handle_stop_request(self, triton_id, response_sender): + if triton_id is None or triton_id == "": + response_sender.send( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + "A request id must be provided for request cancellation" + ) + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + return + + if triton_id in self.triton_id_to_req_id: + req_id = self.triton_id_to_req_id[triton_id] + self.executor.cancel_request(req_id) + + response_sender.send( + pb_utils.InferenceResponse(), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + if not self.executor.can_enqueue_requests(): + return + + # Convert to executor requests. 
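+        # Each Triton request below is converted into a trtllm.Request and
+        # enqueued on the executor. The executor request id returned by
+        # enqueue_requests() is mapped to the Triton request id and its
+        # response sender, so awaiter_loop() can route responses back to the
+        # right client and cancellation_loop() can cancel in-flight requests.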
+ triton_requests = [] + executor_requests = [] + for request in requests: + response_sender = request.get_response_sender() + if get_input_scalar_by_name(request, "stop"): + self.handle_stop_request(request.request_id(), response_sender) + else: + try: + converted = convert_request( + request, self.exclude_input_from_output, self.decoupled + ) + except Exception as e: + response_sender.send( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" + ) + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + else: + triton_requests.append(request) + executor_requests.append(converted) + + with self.lock: + request_ids = self.executor.enqueue_requests(executor_requests) + for req_id, request in zip(request_ids, triton_requests): + triton_id = request.request_id() + self.req_id_to_response_sender[req_id] = ( + triton_id, + request.get_response_sender(), + ) + self.triton_id_to_req_id[triton_id] = req_id + return None + + def awaiter_loop(self): + """Gets responses from executor and returns the results.""" + while self.running: + for response in self.executor.await_responses( + timeout=datetime.timedelta(milliseconds=1) + ): + req_id = response.request_id + with self.lock: + if req_id not in self.req_id_to_response_sender: + continue + triton_id, response_sender = self.req_id_to_response_sender[req_id] + + triton_response, is_final = convert_response(response) + response_sender.send( + triton_response, + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + if is_final + else 0, + ) + + if is_final: + with self.lock: + del self.triton_id_to_req_id[triton_id] + del self.req_id_to_response_sender[req_id] + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def cancellation_loop(self): + """Checks if any pending requests have been cancelled.""" + while self.running: + time.sleep(self.cancellation_check_period_ms / 1000.0) + with self.lock: + for req_id, ( + triton_id, + response_sender, + ) in self.req_id_to_response_sender.items(): + if response_sender.is_cancelled(): + self.executor.cancel_request(req_id) + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def metrics_loop(self): + """Updates triton metrics using stats from the executor.""" + while self.running: + time.sleep(self.stats_check_period_ms / 1000.0) + for stat in self.executor.get_latest_iteration_stats(): + try: + for key, metric in self.all_metrics.items(): + value = None + if hasattr(stat, key): + value = getattr(stat, key) + elif stat.kv_cache_stats is not None and hasattr( + stat.kv_cache_stats, key + ): + value = getattr(stat.kv_cache_stats, key) + elif stat.static_batching_stats is not None and hasattr( + stat.static_batching_stats, key + ): + value = getattr(stat.static_batching_stats, key) + elif stat.inflight_batching_stats is not None and hasattr( + stat.inflight_batching_stats, key + ): + value = getattr(stat.inflight_batching_stats, key) + if value is not None: + if key == "timestamp": + value = convert_timestamp_to_seconds(value) + metric.set(value) + else: + pb_utils.Logger.log_warn(f'Metric "{key}" not found.') + except Exception as e: + pb_utils.Logger.log_warn(f"Error while processing metrics: {e}") + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. 
This function allows + the model to perform any necessary clean ups before exit. + """ + if self.executor.can_enqueue_requests(): + self.running = False + self.awaiter_thread.join() + self.cancellation_thread.join() + self.metrics_thread.join() + self.executor.shutdown() diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt new file mode 100644 index 0000000000..7c9f294b89 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt @@ -0,0 +1,542 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "tensorrt_llm" +backend: "tensorrtllm" +#backend: "python" +max_batch_size: 256 + +model_transaction_policy { + decoupled: True +} + +dynamic_batching { + preferred_batch_size: [ 256 ] + max_queue_delay_microseconds: 1000 +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + allow_ragged_batch: true + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "draft_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, + { + name: "draft_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "bad_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "embedding_bias" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: 
"return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # the unique task ID for the given LoRA. + # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. + # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. + # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. + { + name: "lora_task_id" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] + # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer + # each of the in / out tensors are first flattened and then concatenated together in the format above. + # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. + { + name: "lora_weights" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + # module identifier (same size a first dimension of lora_weights) + # See LoraModule::ModuleType for model id mapping + # + # "attn_qkv": 0 # compbined qkv adapter + # "attn_q": 1 # q adapter + # "attn_k": 2 # k adapter + # "attn_v": 3 # v adapter + # "attn_dense": 4 # adapter for the dense layer in attention + # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection + # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection + # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate + # + # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] + { + name: "lora_config" + data_type: TYPE_INT32 + dims: [ -1, 3 ] + optional: true + allow_ragged_batch: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "${max_beam_width}" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "inflight_fused_batching" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct" + } +} +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + 
value: { + string_value: "${max_tokens_in_paged_kv_cache}" + } +} +parameters: { + key: "max_attention_window_size" + value: { + string_value: "${max_attention_window_size}" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "${batch_scheduler_policy}" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: "${kv_cache_free_gpu_mem_fraction}" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" + } +} +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } +parameters: { + key: "exclude_input_in_output" + value: { + string_value: "True" + } +} +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} +parameters: { + key: "enable_kv_cache_reuse" + value: { + string_value: "${enable_kv_cache_reuse}" + } +} +parameters: { + key: "normalize_log_probs" + value: { + string_value: "${normalize_log_probs}" + } +} +parameters: { + key: "enable_chunked_context" + value: { + string_value: "${enable_chunked_context}" + } +} +parameters: { + key: "gpu_device_ids" + value: { + string_value: "${gpu_device_ids}" + } +} +parameters: { + key: "lora_cache_optimal_adapter_size" + value: { + string_value: "${lora_cache_optimal_adapter_size}" + } +} +parameters: { + key: "lora_cache_max_adapter_size" + value: { + string_value: "${lora_cache_max_adapter_size}" + } +} +parameters: { + key: "lora_cache_gpu_memory_fraction" + value: { + string_value: "${lora_cache_gpu_memory_fraction}" + } +} +parameters: { + key: "lora_cache_host_memory_bytes" + value: { + string_value: "${lora_cache_host_memory_bytes}" + } +} +parameters: { + key: "decoding_mode" + value: { + string_value: "${decoding_mode}" + } +} +parameters: { + key: "executor_worker_path" + value: { + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" + } +} +parameters: { + key: "medusa_choices" + value: { + string_value: "${medusa_choices}" + } +} +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py new file mode 100644 index 0000000000..c621cc14b4 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py @@ -0,0 +1,347 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections.abc import Generator +from dataclasses import dataclass +from typing import Optional + +import numpy as np + + +class RequestValidationError(Exception): + pass + + +def _validate_that(condition: bool, msg: str): + if not condition: + raise RequestValidationError(msg) + + +def _validate_non_empty(data, msg: str): + _validate_that(data is not None and data.size > 0, msg) + + +def _validate_single_gt_0(data, msg: str): + _validate_non_empty(data, msg) + _validate_that(data.flatten()[0] > 0, msg) + + +def _single_value(data: Optional[np.ndarray]): + if data is None: + return None + return data.flatten()[0] + + +@dataclass +class Request: + text_input: np.ndarray = np.array([]) + decoder_text_input: np.ndarray = None + max_tokens: np.ndarray = np.array([]) + bad_words: Optional[np.ndarray] = None + stop_words: Optional[np.ndarray] = None + end_id: Optional[np.ndarray] = None + pad_id: Optional[np.ndarray] = None + top_k: Optional[np.ndarray] = None + top_p: Optional[np.ndarray] = None + temperature: Optional[np.ndarray] = None + length_penalty: Optional[np.ndarray] = None + repetition_penalty: Optional[np.ndarray] = None + min_length: Optional[np.ndarray] = None + return_log_probs: Optional[np.ndarray] = None + prompt_embedding_table: Optional[np.ndarray] = None + prompt_vocab_size: Optional[np.ndarray] = None + embedding_bias_words: Optional[np.ndarray] = None + embedding_bias_weights: Optional[np.ndarray] = None + num_draft_tokens: Optional[np.ndarray] = None + use_draft_logits: Optional[np.ndarray] = None + stream: Optional[np.ndarray] = None + beam_width: Optional[np.ndarray] = None + return_context_logits: Optional[np.ndarray] = None + return_generation_logits: Optional[np.ndarray] = None + random_seed: Optional[np.ndarray] = None + presence_penalty: Optional[np.ndarray] = None + frequency_penalty: Optional[np.ndarray] = None + + def validate(self): + _validate_non_empty(self.text_input, "text_input is required") + _validate_single_gt_0(self.max_tokens, "max_tokens must be a single value > 0") + + num_draft_tokens = _single_value(self.num_draft_tokens) + stream = _single_value(self.stream) + _single_value(self.return_generation_logits) + context_logits = _single_value(self.return_context_logits) + + if num_draft_tokens: + _validate_that( + not stream, "streaming is not supported with speculative decoding" 
+ ) + _validate_that( + not context_logits, + "context logits are not supported with speculative decoding", + ) + + +@dataclass +class DraftRequest: + draft_input_ids: Optional[np.ndarray] = None + draft_logits: Optional[np.ndarray] = None + + +@dataclass +class PreprocResponse: + input_ids: np.ndarray = np.array([]) + decoder_input_ids: np.ndarray = None + input_lengths: np.ndarray = np.array([]) + decoder_input_lengths: np.ndarray = None + bad_words_list: Optional[np.ndarray] = None + stop_words_list: Optional[np.ndarray] = None + embedding_bias: Optional[np.ndarray] = None + end_id: Optional[np.ndarray] = None + pad_id: Optional[np.ndarray] = None + + @classmethod + def with_new_inputs( + cls, + other, + input_ids: Optional[np.ndarray] = None, + input_lengths: Optional[np.ndarray] = None, + ): + return cls( + input_ids=(input_ids if input_ids is not None else other.input_ids), + input_lengths=( + input_lengths if input_lengths is not None else other.input_lengths + ), + decoder_input_ids=other.decoder_input_ids, + decoder_input_lengths=other.decoder_input_lengths, + bad_words_list=other.bad_words_list, + stop_words_list=other.stop_words_list, + end_id=other.end_id, + pad_id=other.pad_id, + ) + + +@dataclass +class GenerationResponse: + output_ids: np.ndarray = np.array([]) + sequence_length: np.ndarray = np.array([]) + cum_log_probs: Optional[np.ndarray] = None + output_log_probs: Optional[np.ndarray] = None + context_logits: Optional[np.ndarray] = None + generation_logits: Optional[np.ndarray] = None + + +@dataclass +class Response: + text_output: np.ndarray = np.array([]) + cum_log_probs: Optional[np.ndarray] = None + output_log_probs: Optional[np.ndarray] = None + context_logits: Optional[np.ndarray] = None + generation_logits: Optional[np.ndarray] = None + + def __eq__(self, o) -> bool: + """Just for testing""" + if not isinstance(o, Response): + return False + return ( + np.array_equal(self.text_output, o.text_output) + and np.array_equal(self.cum_log_probs, o.cum_log_probs) + and np.array_equal(self.output_log_probs, o.output_log_probs) + and np.array_equal(self.context_logits, o.context_logits) + and np.array_equal(self.generation_logits, o.generation_logits) + ) + + +class Decoder: + def __init__(self, streaming=False, accumulate=False): + self._streaming = streaming + self._accumulate = accumulate + + self._accumulated_tokens = None + + def decode( + self, request: Request, speculative_decoding=False + ) -> Generator[Response, None, None]: + preproc_response = self.preprocess(request) + + # print(f"[DEBUG] Decoder.decode {request.temperature=}") + if speculative_decoding: + for gen_response in self._spec_generate(preproc_response, request): + yield self.postprocess(gen_response) + else: + if not self._streaming: + gen_response = self._generate_non_streaming(preproc_response, request) + yield self.postprocess(gen_response) + else: + for gen_response in self._generate(preproc_response, request): + yield self.postprocess(gen_response) + + def encountered_stop_words(self, input_ids, stop_words_ids): + for stop_word_ids in stop_words_ids: + if np.array_equal(input_ids[-len(stop_word_ids) :], stop_word_ids): + return True + return False + + def _spec_generate( + self, preproc: PreprocResponse, request: Request + ) -> Generator[GenerationResponse, None, None]: + prompt_input_ids: np.ndarray = preproc.input_ids[0] + input_ids: np.ndarray = prompt_input_ids + output_len: int = request.max_tokens[0][0] + last_input_ids: np.ndarray = None + draft_output_ids: np.ndarray = None + 
draft_logits: np.ndarray = None + + target_response: GenerationResponse = None + + cur_preproc = preproc + + counter = 0 + while True: + counter += 1 + num_draft_tokens = min( + request.num_draft_tokens[0][0], + len(prompt_input_ids) + output_len - len(input_ids) - 1, + ) + + draft_request = None + if num_draft_tokens > 0: + draft_response: GenerationResponse = self._draft_generate_non_streaming( + cur_preproc, request, num_draft_tokens + ) + seq_len: int = draft_response.sequence_length[0][0] + # [1, beamWidth, outputLength] -> [outputLen] + draft_output_ids = draft_response.output_ids[0][0] + # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] + if request.use_draft_logits is not None and request.use_draft_logits[0]: + if draft_response.generation_logits is not None: + draft_logits = draft_response.generation_logits[0][0] + + input_draft_tokens = draft_output_ids[len(input_ids) : seq_len] + draft_request = DraftRequest( + draft_input_ids=np.expand_dims(input_draft_tokens, 0) + ) + if request.use_draft_logits is not None and request.use_draft_logits[0]: + draft_request.draft_logits = np.expand_dims( + draft_logits[-len(input_draft_tokens) :], 0 + ) + else: + draft_request = DraftRequest() + target_response = self._generate_non_streaming( + cur_preproc, request, draft_request + ) + last_input_ids = input_ids + input_ids = target_response.output_ids[0][0] + cur_preproc = PreprocResponse.with_new_inputs( + cur_preproc, + np.expand_dims(input_ids, 0), + np.array([[len(input_ids)]], dtype=np.int32), + ) + + # Evaluate criteria to stop generation loop. + # If we've hit or exceeded the max output length, should stop + length_stop = len(input_ids) >= len(prompt_input_ids) + output_len + if length_stop: + break + # If draft and target have same outputs, should stop. Normally target should return 1 more token. 
+ # If they are the same length, they should differ at the last token + target_draft_equal = draft_output_ids is not None and np.array_equal( + draft_output_ids, input_ids + ) + if target_draft_equal: + break + # If tokens no longer change, should stop, means we have hit early stopping + last_current_equal = np.array_equal(last_input_ids, input_ids) + if last_current_equal: + break + # Need to check if stop words was encountered + hit_stop_words = self.encountered_stop_words( + input_ids, preproc.stop_words_list[0] + ) + if hit_stop_words: + break + + yield target_response + + def _draft_generate_non_streaming( + self, preproc: PreprocResponse, request: Request, num_draft_tokens: int + ) -> GenerationResponse: + raise NotImplementedError() + + def _generate( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> Generator[GenerationResponse, None, None]: + raise NotImplementedError() + + def _generate_non_streaming( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> GenerationResponse: + raise NotImplementedError() + + def postprocess(self, gen_response: GenerationResponse) -> Response: + if self._accumulate and self._streaming: + new_tokens: np.ndarray = gen_response.output_ids + if new_tokens.ndim != 3: + raise Exception("Expected output_ids tensor to have 3 dims.") + if new_tokens.shape[0] != 1: + raise Exception("Expected batch size of 1") + if new_tokens.shape[1] != 1: + raise Exception( + "Accumulation of tokens is only implemented for beam width = 1" + ) + + self._accumulated_tokens = ( + new_tokens + if (self._accumulated_tokens is None) + else np.concatenate((self._accumulated_tokens, new_tokens), axis=2) + ) + sequence_lengths = np.array( + [[self._accumulated_tokens.shape[2]]], dtype=np.int32 + ) + return self._postprocess( + self._accumulated_tokens, sequence_lengths, gen_response + ) + else: + return self._postprocess(gen_response.output_ids, None, gen_response) + + def _postprocess( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ) -> Response: + raise NotImplementedError() + + def preprocess(self, request: Request) -> PreprocResponse: + raise NotImplementedError() + + def reset_decoder(self): + self._accumulated_tokens = None diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py new file mode 100644 index 0000000000..62c06f4836 --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -0,0 +1,478 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections.abc import Callable +from typing import Dict, Optional + +import numpy as np +import triton_python_backend_utils as pb_utils +from lib.decode import * +from typing_extensions import override + + +class TritonDecoder(Decoder): + def __init__( + self, + streaming=False, + accumulate=False, + preproc_model_name="preprocessing", + postproc_model_name="postprocessing", + llm_model_name="tensorrt_llm", + draft_llm_model_name: Optional[str] = None, + ): + super().__init__(streaming=streaming, accumulate=accumulate) + self.preproc_model_name = preproc_model_name + self.postproc_model_name = postproc_model_name + self.llm_model_name = llm_model_name + self.draft_llm_model_name = draft_llm_model_name + + self._preproc_outputs = [ + "INPUT_ID", + "DECODER_INPUT_ID", + "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + "EMBEDDING_BIAS", + "OUT_PAD_ID", + "OUT_END_ID", + ] + + self._llm_outputs = [ + "output_ids", + "sequence_length", + "cum_log_probs", + "output_log_probs", + "context_logits", + "generation_logits", + ] + + self._postproc_outputs = [ + "OUTPUT", + ] + + self.input_names = [ + "text_input", + "decoder_text_input", + "max_tokens", + "bad_words", + "stop_words", + "end_id", + "pad_id", + "top_k", + "top_p", + "temperature", + "length_penalty", + "repetition_penalty", + "min_length", + "presence_penalty", + "frequency_penalty", + "random_seed", + "return_log_probs", + "return_context_logits", + "return_generation_logits", + "beam_width", + "stream", + "prompt_embedding_table", + "prompt_vocab_size", + "embedding_bias_words", + "embedding_bias_weights", + "num_draft_tokens", + "use_draft_logits", + ] + + self.__undo_reshape_whitelist = { + "max_tokens", + "end_id", + "pad_id", + "top_k", + "top_p", + "temperature", + "length_penalty", + "repetition_penalty", + "min_length", + "presence_penalty", + "frequency_penalty", + "random_seed", + "return_log_probs", + "return_context_logits", + "return_generation_logits", + "beam_width", + "stream", + "prompt_vocab_size", + "num_draft_tokens", + "use_draft_logits", + } + + def _exec_triton_request(self, request): + responses = request.exec(decoupled=True) + for r in responses: + if r.has_error(): + raise pb_utils.TritonModelException(r.error().message()) + yield r + + def _exec_triton_request_single(self, request): + responses = request.exec(decoupled=False) + if responses.has_error(): + raise pb_utils.TritonModelException(responses.error().message()) + return responses + + def create_triton_response(self, response: Response): + name_map = { + "text_output": "text_output", + "cum_log_probs": "cum_log_probs", + "output_log_probs": "output_log_probs", + "context_logits": "context_logits", + "generation_logits": 
"generation_logits", + } + tensors = self.create_triton_tensors(response, name_map) + return pb_utils.InferenceResponse(output_tensors=tensors) + + def convert_triton_request(self, triton_request) -> Request: + request = Request() + for triton_name in self.input_names: + tensor = pb_utils.get_input_tensor_by_name(triton_request, triton_name) + target_name = triton_name + if tensor is None: + continue + if not hasattr(request, target_name): + raise AttributeError(f"Request has no attribute '{target_name}'") + setattr(request, target_name, tensor.as_numpy()) + return request + + def convert_triton_response( + self, triton_response, response_factory: Callable, name_map=None + ): + response = response_factory() + for tensor in triton_response.output_tensors(): + if tensor is None: + continue + triton_name = tensor.name() + value = tensor.as_numpy() + target_name = triton_name + if name_map and triton_name in name_map: + target_name = name_map[triton_name] + if name_map and not triton_name in name_map: + continue + if target_name is None: + # explicitly ignore this triton input + continue + if not hasattr(response, target_name): + raise AttributeError( + f"response object has not attribute '{target_name}'" + ) + setattr(response, target_name, value) + return response + + def __undo_reshape(self, x, name): + if name in self.__undo_reshape_whitelist and len(x.shape) == 1: + # handle reshapes + return np.expand_dims(x, 0) + else: + return x + + def create_triton_tensors(self, obj, name_map: dict): + tensors = [] + for name, triton_name in name_map.items(): + if triton_name is None: + continue + value = getattr(obj, name) + if value is None: + continue + t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) + tensors.append(t) + return tensors + + @override + def preprocess(self, request: Request) -> PreprocResponse: + input_tensors = self._get_preproc_tensors(request) + triton_req = pb_utils.InferenceRequest( + model_name=self.preproc_model_name, + inputs=input_tensors, + requested_output_names=self._preproc_outputs, + ) + triton_output = self._exec_triton_request_single(triton_req) + return self._get_preproc_response(triton_output) + + def _get_preproc_tensors(self, request: Request): + name_map = { + "text_input": "QUERY", + "decoder_text_input": "DECODER_QUERY", + "max_tokens": "REQUEST_OUTPUT_LEN", + "bad_words": "BAD_WORDS_DICT", + "stop_words": "STOP_WORDS_DICT", + "embedding_bias_words": "EMBEDDING_BIAS_WORDS", + "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", + "pad_id": "PAD_ID", + "end_id": "END_ID", + } + return self.create_triton_tensors(request, name_map) + + def _get_preproc_response(self, triton_output): + name_map = { + "INPUT_ID": "input_ids", + "DECODER_INPUT_ID": "decoder_input_ids", + "REQUEST_INPUT_LEN": "input_lengths", + "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", + "BAD_WORDS_IDS": "bad_words_list", + "STOP_WORDS_IDS": "stop_words_list", + "EMBEDDING_BIAS": "embedding_bias", + "OUT_PAD_ID": "pad_id", + "OUT_END_ID": "end_id", + } + return self.convert_triton_response(triton_output, PreprocResponse, name_map) + + @override + def _draft_generate_non_streaming( + self, preproc: PreprocResponse, request: Request, num_draft_tokens: int + ) -> GenerationResponse: + input_tensors = self._get_llm_tensors( + preproc, request, num_draft_tokens, None, True + ) + triton_req = pb_utils.InferenceRequest( + model_name=self.draft_llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + triton_response = 
self._exec_triton_request_single(triton_req) + llm_response = self._get_llm_response(triton_response) + return llm_response + + @override + def _generate( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> Generator[GenerationResponse, None, None]: + input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) + triton_req = pb_utils.InferenceRequest( + model_name=self.llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + for r in self._exec_triton_request(triton_req): + yield self._get_llm_response(r) + + @override + def _generate_non_streaming( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> GenerationResponse: + input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) + triton_req = pb_utils.InferenceRequest( + model_name=self.llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + r = self._exec_triton_request_single(triton_req) + return self._get_llm_response(r) + + def _get_llm_tensors( + self, + preproc: PreprocResponse, + request: Request, + num_output_tokens: Optional[int] = None, + draft_request: Optional[DraftRequest] = None, + is_draft_model_request: bool = False, + ): + tensors = [] + # print(f"[get_llm_tensors] {request.temperature=}") + tensors.extend(self._get_tensors_from_preproc(preproc)) + tensors.extend( + self._get_llm_tensors_from_request( + request, num_output_tokens, draft_request, is_draft_model_request + ) + ) + return tensors + + def _get_tensors_from_preproc(self, preproc: PreprocResponse): + name_map = { + "input_ids": "input_ids", + "decoder_input_ids": "decoder_input_ids", + "input_lengths": "input_lengths", + "bad_words_list": "bad_words_list", + "stop_words_list": "stop_words_list", + "embedding_bias": "embedding_bias", + "pad_id": "pad_id", + "end_id": "end_id", + } + return self.create_triton_tensors(preproc, name_map) + + def _get_llm_tensors_from_request( + self, + request: Request, + num_output_tokens: Optional[int] = None, + draft_request: Optional[DraftRequest] = None, + is_draft_model_request: bool = False, + ): + name_map: Dict[str, Optional[str]] = { + "beam_width": "beam_width", + "top_k": "runtime_top_k", + "top_p": "runtime_top_p", + # "temperature": "temperature", + "length_penalty": "len_penalty", + "repetition_penalty": "repetition_penalty", + "min_length": "min_length", + "presence_penalty": "presence_penalty", + "frequency_penalty": "frequency_penalty", + "random_seed": "random_seed", + "return_log_probs": "return_log_probs", + "stream": "streaming", + "prompt_embedding_table": "prompt_embedding_table", + "prompt_vocab_size": "prompt_vocab_size", + } + # print(f"[get_llm_tensors_from_request] {request.temperature=}") + temp_found = "temperature" in name_map + # print(f"[get_llm_tensors_from_request] temperature in name_map = {temp_found}") + tensors = self.create_triton_tensors(request, name_map) + + out_len = request.max_tokens[0][0] if request.max_tokens else None + if num_output_tokens is not None: + out_len = num_output_tokens + elif draft_request: + if draft_request.draft_input_ids is not None: + out_len = len(draft_request.draft_input_ids[0]) + 1 + else: + out_len = 1 + + if out_len is None: + raise Exception("Could not determine request_output_len") + else: + tensors.append( + pb_utils.Tensor( + "request_output_len", np.array([[out_len]], dtype=np.int32) + ) + ) + + if draft_request: + if 
draft_request.draft_input_ids is not None: + tensors.append( + pb_utils.Tensor("draft_input_ids", draft_request.draft_input_ids) + ) + if ( + draft_request.draft_logits is not None + and request.use_draft_logits is not None + and request.use_draft_logits[0] + ): + tensors.append( + pb_utils.Tensor("draft_logits", draft_request.draft_logits) + ) + + return_context_logits = False + return_generation_logits = False + if draft_request is None: + if is_draft_model_request: + return_generation_logits = ( + request.use_draft_logits[0] + if request.use_draft_logits is not None + else False + ) + else: + return_context_logits = ( + request.return_context_logits[0] + if request.return_context_logits is not None + else False + ) + return_generation_logits = ( + request.return_generation_logits[0] + if request.return_generation_logits is not None + else False + ) + + tensors.append( + pb_utils.Tensor( + "return_context_logits", np.array([[return_context_logits]]) + ) + ) + tensors.append( + pb_utils.Tensor( + "return_generation_logits", np.array([[return_generation_logits]]) + ) + ) + return tensors + + def _get_llm_response(self, triton_output): + name_map = { + "output_ids": "output_ids", + "sequence_length": "sequence_length", + "cum_log_probs": "cum_log_probs", + "output_log_probs": "output_log_probs", + "context_logits": "context_logits", + "generation_logits": "generation_logits", + } + return self.convert_triton_response(triton_output, GenerationResponse, name_map) + + def _postprocess( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ) -> Response: + input_tensors = self._get_postproc_tensors( + tokens, sequence_lengths, gen_response + ) + triton_req = pb_utils.InferenceRequest( + model_name=self.postproc_model_name, + inputs=input_tensors, + requested_output_names=self._postproc_outputs, + ) + r = self._exec_triton_request_single(triton_req) + response = self._get_response(r, gen_response) + return response + + def _get_postproc_tensors( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ): + tensors = [ + pb_utils.Tensor("TOKENS_BATCH", tokens), + pb_utils.Tensor( + "SEQUENCE_LENGTH", + sequence_lengths if sequence_lengths else gen_response.sequence_length, + ), + ] + return tensors + + def _get_response(self, triton_output, gen_res: GenerationResponse): + tensors = triton_output.output_tensors() + t_map = {} + for named_t in tensors: + name = named_t.name() + t = named_t.as_numpy() + t_map[name] = t + response = Response( + text_output=t_map["OUTPUT"], + cum_log_probs=gen_res.cum_log_probs, + output_log_probs=gen_res.output_log_probs, + context_logits=gen_res.context_logits, + generation_logits=gen_res.generation_logits, + ) + return response diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py new file mode 100644 index 0000000000..0a5d54546d --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py @@ -0,0 +1,137 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import traceback + +import triton_python_backend_utils as pb_utils +from lib.triton_decoder import TritonDecoder + + +class TritonPythonModel: + def initialize(self, args): + # Parse model configs + model_config = json.loads(args["model_config"]) + + params = model_config["parameters"] + + accumulate_tokens_str = "" + if "accumulate_tokens" in params: + accumulate_tokens_str = params["accumulate_tokens"]["string_value"] + + self.accumulate_tokens = accumulate_tokens_str.lower() in [ + "true", + "yes", + "1", + "t", + ] + + self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) + + self.logger = pb_utils.Logger + + self.llm_model_name = "tensorrt_llm" + if "tensorrt_llm_model_name" in params: + self.llm_model_name = params["tensorrt_llm_model_name"]["string_value"] + self.draft_llm_model_name = None + if "tensorrt_llm_draft_model_name" in params: + self.draft_llm_model_name = params["tensorrt_llm_draft_model_name"][ + "string_value" + ] + + self.decoder = TritonDecoder( + streaming=self.decoupled, + accumulate=self.accumulate_tokens, + preproc_model_name="preprocessing", + postproc_model_name="postprocessing", + llm_model_name=self.llm_model_name, + draft_llm_model_name=self.draft_llm_model_name, + ) + + def execute(self, requests): + responses = [] + + for request in requests: + if self.decoupled: + response_sender = request.get_response_sender() + try: + req = self.decoder.convert_triton_request(request) + req.validate() + # print(f"[DEBUG] ========= [bls model.py] {req.temperature=} ===========") + speculative_decode = ( + req.num_draft_tokens is not None and req.num_draft_tokens[0][0] > 0 + ) + if speculative_decode and ( + self.draft_llm_model_name is None or self.draft_llm_model_name == "" + ): + raise Exception( + "cannot perform speculative decoding without draft model" + ) + res_gen = self.decoder.decode( + req, speculative_decoding=speculative_decode + ) + + for res in res_gen: + triton_response = self.decoder.create_triton_response(res) + if self.decoupled: + response_sender.send(triton_response) + else: + responses.append(triton_response) + + if self.decoupled: + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + except Exception: + self.logger.log_error(traceback.format_exc()) + # If encountering an error, send a response with err msg + 
error_response = pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(traceback.format_exc()), + ) + + if self.decoupled: + response_sender.send(error_response) + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + responses.append(error_response) + + self.decoder.reset_decoder() + if self.decoupled: + return None + else: + assert len(responses) == len(requests) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt new file mode 100644 index 0000000000..aa3b26336c --- /dev/null +++ b/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt @@ -0,0 +1,252 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
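+
+# BLS orchestration model: it runs on Triton's Python backend, uses a
+# decoupled transaction policy so responses can be streamed back token by
+# token, and exposes the generation controls below as optional inputs.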
+ +backend: "python" +max_batch_size: 256 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "max_tokens" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "num_draft_tokens", + data_type: TYPE_INT32, + dims: [ 1 ] + optional: true + }, + { + name: "use_draft_logits", + data_type: TYPE_BOOL, + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] + +parameters: { + key: "accumulate_tokens" + value: { + string_value: "${accumulate_tokens}" + } +} +parameters: { + key: "tensorrt_llm_model_name" + value: { + string_value: "tensorrt_llm" + } +} +parameters: { + key: "tensorrt_llm_draft_model_name" + value: { + string_value: "" + } +} + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py index e0a31af3e6..493bd5eafd 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ 
b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -1,37 +1,90 @@ +import copy import os from pathlib import Path import pytest from fastapi.testclient import TestClient -from src.api_server import app +from src.api_server import init_app -TEST_MODEL = "gpt2" -TEST_PROMPT = "What is the capital of France?" +### TEST ENVIRONMENT SETUP ### +TEST_BACKEND = "" +TEST_MODEL = "" +TEST_PROMPT = "What is machine learning?" +TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] +TEST_TOKENIZER = "meta-llama/Meta-Llama-3-8B-Instruct" +try: + import vllm as _ + + TEST_BACKEND = "vllm" + TEST_MODEL = "llama-3-8b-instruct" +except ImportError: + pass + +try: + import tensorrt_llm as _ + + TEST_BACKEND = "tensorrtllm" + TEST_MODEL = "tensorrt_llm_bls" +except ImportError: + pass + +if not TEST_BACKEND or not TEST_MODEL: + raise Exception("Unknown test environment") +### -# TODO: Test TRTLLM too class TestChatCompletions: # TODO: Consider module/package scope, or join Completions tests into same file # to run server only once for both sets of tests for faster iteration. - @pytest.fixture(scope="class", autouse=True) + @pytest.fixture(scope="class", autouse=False) def client(self): - # TODO: Test TRT-LLM models as well - model_repository = Path(__file__).parent / "vllm_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") + app = self.setup_app( + tokenizer=TEST_TOKENIZER, model_repository=model_repository + ) with TestClient(app) as test_client: yield test_client - def test_chat_completions_defaults(self, client): - messages = [{"role": "user", "content": TEST_PROMPT}] + def setup_app(self, tokenizer: str, model_repository: str): + os.environ["TOKENIZER"] = tokenizer + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + app = init_app() + return app + + # A TOKENIZER must be known for /chat/completions endpoint in order to + # apply chat templates, and for simplicity in determination, users should + # define the TOKENIZER. So, explicitly raise an error if none is provided. 
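+    # For illustration, with a Hugging Face tokenizer the chat prompt is
+    # typically built with something like
+    #   tokenizer.apply_chat_template(messages, tokenize=False,
+    #                                 add_generation_prompt=True)
+    # which is not possible when no tokenizer is configured.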
+ def test_chat_completions_no_tokenizer(self): + model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") + app = self.setup_app(tokenizer="", model_repository=model_repository) + with TestClient(app) as client: + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, + ) + assert response.status_code == 400 + assert response.json()["detail"] == "Unknown tokenizer" + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_chat_completions_streaming(self, client): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_chat_completions_no_streaming(self, client): + pass + def test_chat_completions_defaults(self, client): response = client.post( - "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, ) assert response.status_code == 200 message = response.json()["choices"][0]["message"] assert message["content"].strip() assert message["role"] == "assistant" + # "usage" currently not supported + assert response.json()["usage"] == None def test_chat_completions_system_prompt(self, client): # NOTE: Currently just sanity check that there are no issues when a @@ -82,11 +135,13 @@ def test_chat_completions_system_prompt_only(self, client): def test_chat_completions_sampling_parameters( self, client, sampling_parameter, value ): - messages = [{"role": "user", "content": TEST_PROMPT}] - response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": messages, sampling_parameter: value}, + json={ + "model": TEST_MODEL, + "messages": TEST_MESSAGES, + sampling_parameter: value, + }, ) # TODO: Add support and remove this check @@ -116,11 +171,13 @@ def test_chat_completions_sampling_parameters( def test_chat_completions_invalid_sampling_parameters( self, client, sampling_parameter, value ): - messages = [{"role": "user", "content": TEST_PROMPT}] - response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": messages, sampling_parameter: value}, + json={ + "model": TEST_MODEL, + "messages": TEST_MESSAGES, + sampling_parameter: value, + }, ) print("Response:", response.json()) @@ -129,8 +186,7 @@ def test_chat_completions_invalid_sampling_parameters( # Simple tests to verify max_tokens roughly behaves as expected def test_chat_completions_max_tokens(self, client): responses = [] - messages = [{"role": "user", "content": TEST_PROMPT}] - payload = {"model": TEST_MODEL, "messages": messages, "max_tokens": 1} + payload = {"model": TEST_MODEL, "messages": TEST_MESSAGES, "max_tokens": 1} # Send two requests with max_tokens = 1 to check their similarity payload["max_tokens"] = 1 @@ -172,17 +228,17 @@ def test_chat_completions_max_tokens(self, client): assert len(response1_text) == len(response2_text) == 1 assert len(response3_text) > len(response1_text) + @pytest.mark.skipif(TEST_BACKEND != "vllm", reason="Only used to test vLLM backend") @pytest.mark.parametrize( "temperature", [0.0, 1.0], ) # Simple tests to verify temperature roughly behaves as expected - def test_chat_completions_temperature(self, client, temperature): + def test_chat_completions_temperature_vllm(self, client, temperature): responses = [] - messages = [{"role": "user", "content": TEST_PROMPT}] payload = { "model": TEST_MODEL, - "messages": messages, + "messages": TEST_MESSAGES, "temperature": temperature, } @@ -227,6 +283,119 @@ def test_chat_completions_temperature(self, client, temperature): else: 
raise ValueError(f"Unexpected {temperature=} for this test.") + # Remove xfail when fix is released and this test returns xpass status + @pytest.mark.xfail( + reason="TRT-LLM BLS model will ignore temperature until a later release" + ) + @pytest.mark.skipif( + TEST_BACKEND != "tensorrtllm", reason="Only used to test TRT-LLM backend" + ) + # Simple tests to verify temperature roughly behaves as expected + def test_chat_completions_temperature_tensorrtllm(self, client): + responses = [] + payload1 = { + "model": TEST_MODEL, + "messages": TEST_MESSAGES, + # Increase token length to allow more room for variability + "max_tokens": 200, + "temperature": 0.0, + # TRT-LLM requires certain settings of `top_k` / `top_p` to + # respect changes in `temperature` + "top_p": 0.5, + } + + payload2 = copy.deepcopy(payload1) + payload2["temperature"] = 1.0 + + # First 2 responses should be the same in TRT-LLM with identical payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + + assert response1_text == response2_text + assert response1_text != response3_text + + # Simple tests to verify seed roughly behaves as expected + def test_chat_completions_seed(self, client): + responses = [] + payload1 = { + "model": TEST_MODEL, + "messages": TEST_MESSAGES, + # Increase token length to allow more room for variability + "max_tokens": 200, + "seed": 1, + } + payload2 = copy.deepcopy(payload1) + payload2["seed"] = 2 + + # First 2 responses should be the same in both vLLM and TRT-LLM with identical seed + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + # Third response should differ with different seed in payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + + assert response1_text == response2_text + assert response1_text != response3_text + def test_chat_completions_no_message(self, client): # Message validation requires min_length of 1 messages = [] @@ -248,6 +417,15 @@ def test_chat_completions_empty_message(self, client): assert response.status_code == 422 assert response.json()["detail"][0]["msg"] == "Field required" + def test_chat_completions_multiple_choices(self, client): + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": TEST_MESSAGES, "n": 2}, + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "Only 
single choice is supported" + @pytest.mark.skip(reason="Not Implemented Yet") def test_function_calling(self): pass @@ -260,9 +438,7 @@ def test_lora(self): def test_multi_lora(self): pass - # TODO: Test for handling invalid messages or payloads - # TODO: test chat/instruct model? gpt2 logs error about lack of chat template - # TODO: test roles? - # TODO: function calling? - # TODO: lora / multi-lora? - # TODO: genai-perf test? + # TODO: Do we want to support "usage" field for token counts in response? + @pytest.mark.skip(reason="Not Implemented Yet") + def test_usage_response(self): + pass diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/example/src/tests/test_completions.py index 84a21e13d3..2b2c78c548 100644 --- a/qa/L0_openai/example/src/tests/test_completions.py +++ b/qa/L0_openai/example/src/tests/test_completions.py @@ -1,12 +1,34 @@ +import copy import os from pathlib import Path import pytest from fastapi.testclient import TestClient -from src.api_server import app +from src.api_server import init_app -TEST_MODEL = "gpt2" -TEST_PROMPT = "The capital of France is" +### TEST ENVIRONMENT SETUP ### +TEST_BACKEND = "" +TEST_MODEL = "" +TEST_PROMPT = "Machine learning is" +try: + import vllm as _ + + TEST_BACKEND = "vllm" + TEST_MODEL = "llama-3-8b-instruct" +except ImportError: + pass + +try: + import tensorrt_llm as _ + + TEST_BACKEND = "tensorrtllm" + TEST_MODEL = "tensorrt_llm_bls" +except ImportError: + pass + +if not TEST_BACKEND or not TEST_MODEL: + raise Exception("Unknown test environment") +### class TestCompletions: @@ -14,9 +36,9 @@ class TestCompletions: # to run server only once for both sets of tests for faster iteration. @pytest.fixture(scope="class") def client(self): - # TODO: Test TRT-LLM models as well - model_repository = Path(__file__).parent / "vllm_models" + model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + app = init_app() with TestClient(app) as test_client: yield test_client @@ -31,6 +53,8 @@ def test_completions_defaults(self, client): # NOTE: Could be improved to look for certain quality of response, # or tested with dummy identity model. assert response.json()["choices"][0]["text"].strip() + # "usage" currently not supported + assert response.json()["usage"] == None @pytest.mark.parametrize( "sampling_parameter, value", @@ -105,6 +129,149 @@ def test_completions_max_tokens(self, client): assert len(response1_text) == len(response2_text) == 1 assert len(response3_text) > len(response1_text) + @pytest.mark.skipif(TEST_BACKEND != "vllm", reason="Only used to test vLLM backend") + @pytest.mark.parametrize( + "temperature", + [0.0, 1.0], + ) + # Simple tests to verify temperature roughly behaves as expected + def test_completions_temperature_vllm(self, client, temperature): + responses = [] + payload = { + "model": TEST_MODEL, + "prompt": TEST_PROMPT, + "temperature": temperature, + } + + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + + # Temperature of 0.0 indicates greedy sampling, so check + # that two equivalent requests produce the same response. 
+ if temperature == 0.0: + # NOTE: This check may be ambitious to get an exact match in all + # frameworks depending on how other parameter defaults are set, so + # it can probably be removed if it introduces flakiness. + print(f"Comparing '{response1_text}' == '{response2_text}'") + assert response1_text == response2_text + # Temperature of 1.0 indicates maximum randomness, so check + # that two equivalent requests produce different responses. + elif temperature == 1.0: + print(f"Comparing '{response1_text}' != '{response2_text}'") + assert response1_text != response2_text + # Don't bother checking values other than the extremes + else: + raise ValueError(f"Unexpected {temperature=} for this test.") + + # Remove xfail when fix is released and this test returns xpass status + @pytest.mark.xfail( + reason="TRT-LLM BLS model will ignore temperature until a later release" + ) + @pytest.mark.skipif( + TEST_BACKEND != "tensorrtllm", reason="Only used to test TRT-LLM backend" + ) + # Simple tests to verify temperature roughly behaves as expected + def test_completions_temperature_tensorrtllm(self, client): + responses = [] + payload1 = { + "model": TEST_MODEL, + "prompt": TEST_PROMPT, + "temperature": 0.0, + # TRT-LLM requires certain settings of `top_k` / `top_p` to + # respect changes in `temperature` + "top_p": 0.5, + } + payload2 = copy.deepcopy(payload1) + payload2["temperature"] = 1.0 + + # First 2 responses should be the same in TRT-LLM with identical payload + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + + assert response1_text == response2_text + assert response1_text != response3_text + + # Simple tests to verify seed roughly behaves as expected + def test_completions_seed(self, client): + responses = [] + payload1 = {"model": TEST_MODEL, "prompt": TEST_PROMPT, "seed": 1} + payload2 = copy.deepcopy(payload1) + payload2["seed"] = 2 + + # First 2 responses should be the same in TRT-LLM with identical payload + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + + assert response1_text == response2_text + assert response1_text != response3_text + @pytest.mark.parametrize( "sampling_parameter, value", [ @@ -133,6 +300,14 @@ def test_completions_invalid_sampling_parameters( print("Response:", response.json()) assert response.status_code == 422 + def test_completions_empty_request(self, client): + response 
= client.post("/v1/completions", json={}) + assert response.status_code == 422 + + def test_completions_no_model(self, client): + response = client.post("/v1/completions", json={"prompt": TEST_PROMPT}) + assert response.status_code == 422 + def test_completions_no_prompt(self, client): response = client.post("/v1/completions", json={"model": TEST_MODEL}) assert response.status_code == 422 @@ -152,6 +327,14 @@ def test_no_prompt(self, client): # 422 Error returned by schema validation assert response.status_code == 422 + def test_completions_multiple_choices(self, client): + response = client.post( + "/v1/completions", json={"model": TEST_MODEL, "prompt": TEST_PROMPT, "n": 2} + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "Only single choice is supported" + @pytest.mark.skip(reason="Not Implemented Yet") def test_lora(self): pass @@ -159,3 +342,8 @@ def test_lora(self): @pytest.mark.skip(reason="Not Implemented Yet") def test_multi_lora(self): pass + + # TODO: Do we want to support "usage" field for token counts in response? + @pytest.mark.skip(reason="Not Implemented Yet") + def test_usage_response(self): + pass diff --git a/qa/L0_openai/example/src/tests/test_observability.py b/qa/L0_openai/example/src/tests/test_observability.py new file mode 100644 index 0000000000..67e1f42255 --- /dev/null +++ b/qa/L0_openai/example/src/tests/test_observability.py @@ -0,0 +1,67 @@ +import os +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from src.api_server import init_app + +TEST_MODEL = "mock_llm" + + +class TestObservability: + @pytest.fixture(scope="class") + def client(self): + model_repository = Path(__file__).parent / "test_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + app = init_app() + with TestClient(app) as test_client: + yield test_client + + ### General Error Handling ### + def test_not_found(self, client): + response = client.get("/does-not-exist") + assert response.status_code == 404 + + ### Startup / Health ### + def test_startup_success(self, client): + response = client.get("/health") + assert response.status_code == 200 + + def test_startup_fail(self): + os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" + with pytest.raises(Exception): + # Test that FastAPI lifespan startup fails when initializing Triton + # with unknown model repository. + app = init_app() + with TestClient(app): + pass + + ### Metrics ### + def test_startup_metrics(self, client): + response = client.get("/metrics") + assert response.status_code == 200 + # FIXME: Flesh out more + # NOTE: response.json() works even on non-json prometheus data? 
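+        # (Likely because the route returns the Prometheus text as a plain
+        # str, which FastAPI serializes as a JSON string, so .json() yields
+        # a str and `in` performs a substring check.)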
+ assert "nv_cpu_utilization" in response.json() + + ### Models ### + def test_models_list(self, client): + # TODO: Load multiple models and make sure exactly ALL are returned + response = client.get("/v1/models") + assert response.status_code == 200 + # TODO: Flesh out + models = response.json()["data"] + assert len(models) == 1 + assert models[0]["id"] == TEST_MODEL + assert models[0]["object"] == "model" + assert models[0]["created"] > 0 + + def test_models_get(self, client): + # TODO: Load multiple models and make sure exactly 1 is returned + response = client.get(f"/v1/models/{TEST_MODEL}") + assert response.status_code == 200 + # TODO: Flesh out + model = response.json() + assert model["id"] == TEST_MODEL + assert model["object"] == "model" + assert model["created"] > 0 diff --git a/qa/L0_openai/example/src/tests/test_utilities.py b/qa/L0_openai/example/src/tests/test_utilities.py deleted file mode 100644 index 87470f83a8..0000000000 --- a/qa/L0_openai/example/src/tests/test_utilities.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from pathlib import Path - -import pytest -from fastapi.testclient import TestClient -from src.api_server import app - -TEST_MODEL = "mock_llm" - - -# TODO: May need to modify fixture scope -@pytest.fixture(scope="function", autouse=True) -def setup_model_repository(): - model_repository = Path(__file__).parent / "test_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - - -def test_not_found(): - with TestClient(app) as client: - response = client.get("/does-not-exist") - assert response.status_code == 404 - - -### Startup / Health ### - - -def test_startup_success(): - with TestClient(app) as client: - response = client.get("/health") - assert response.status_code == 200 - - -def test_startup_fail(): - os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" - with pytest.raises(Exception): - # Test that FastAPI lifespan startup fails when initializing Triton - # with unknown model repository. - with TestClient(app): - pass - - -### Metrics ### - - -def test_startup_metrics(): - with TestClient(app) as client: - response = client.get("/metrics") - assert response.status_code == 200 - # FIXME: Flesh out more - # NOTE: response.json() works even on non-json prometheus data? 
- assert "nv_cpu_utilization" in response.json() - - -### Models ### - - -def test_models_list(): - # TODO: Load multiple models and make sure exactly ALL are returned - with TestClient(app) as client: - response = client.get("/v1/models") - assert response.status_code == 200 - # TODO: Flesh out - models = response.json()["data"] - assert len(models) == 1 - assert models[0]["id"] == TEST_MODEL - assert models[0]["object"] == "model" - assert models[0]["created"] > 0 - - -def test_models_get(): - # TODO: Load multiple models and make sure exactly 1 is returned - with TestClient(app) as client: - response = client.get(f"/v1/models/{TEST_MODEL}") - assert response.status_code == 200 - # TODO: Flesh out - model = response.json() - assert model["id"] == TEST_MODEL - assert model["object"] == "model" - assert model["created"] > 0 diff --git a/qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json b/qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json deleted file mode 100644 index 96f398c471..0000000000 --- a/qa/L0_openai/example/src/tests/vllm_models/gpt2/1/model.json +++ /dev/null @@ -1 +0,0 @@ -{"model": "gpt2", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json b/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json new file mode 100644 index 0000000000..e60275ce16 --- /dev/null +++ b/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json @@ -0,0 +1 @@ +{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/example/src/tests/vllm_models/gpt2/config.pbtxt b/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/vllm_models/gpt2/config.pbtxt rename to qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/example/src/utils/triton.py index 498e0f331e..594eeed08b 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/example/src/utils/triton.py @@ -10,12 +10,11 @@ # TODO: Refactor # NOTE: Allow python backend for testing purposes -# TODO: How did this interact with BLS/TRTLLM models before this change? SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm", "python"} -LLM_BACKENDS = {"vllm", "tensorrtllm"} # TODO -KNOWN_MODELS = {"gpt2": "hf:gpt2"} +LLM_BACKENDS: set = {"vllm", "tensorrtllm"} +# TODO: pydantic validation? 
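+# Bundles everything the frontend tracks per loaded model: the Triton model
+# handle and its backend, an optional tokenizer, and the function used to
+# convert OpenAI requests into backend-specific Triton inference requests.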
@dataclass class TritonModelMetadata: # Name used in Triton model repository @@ -24,48 +23,96 @@ class TritonModelMetadata: backend: str # Triton model object handle model: tritonserver.Model - # TODO: Address typing - tokenizer: typing.Any - # Name in terms of a HuggingFace model or remote model registry name - source_name: str + tokenizer: typing.Optional[typing.Any] # Time that model was loaded by Triton create_time: int + # TODO: Address typing + request_convert_fn: typing.Optional[typing.Any] + + +def determine_request_format(backend): + # Request conversion from OpenAI format to backend-specific format + if backend == "vllm": + request_convert_fn = create_vllm_inference_request + # Python included to support TRT-LLM BLS model and TRT-LLM python runtime + elif backend in ["tensorrtllm", "python"]: + request_convert_fn = create_trtllm_inference_request + else: + request_convert_fn = None + + return request_convert_fn + + +# TODO: Refactor: +# NOTE: We need to figure out a few things while looking at the models in the +# triton model repository. +# 1. Which model should we interact with when sending requests to Triton core? +# a. For a single model, this is trivial, and would support any backend. +# b. For TRT-LLM, this should be 'ensemble' or 'tensorrt_llm_bls' following +# TRT-LLM defaults/examples. However, this could also be renamed by the user +# to have a more intuitive front-facing name, such as "llama3-8b". Note that +# TRT-LLM pipelines produced by the Triton CLI will generally be renamed like +# this. FIXME: This is a relatively fragile flow and should be improved. +# 2. Which tokenizer to use for things like applying a chat template or making +# a tool/function call. These are primarily relevant for the /chat/completions +# endpoint, but not the /completions endpoint. +# - For now, require user-defined TOKENIZER for simplicity. +# 3. Which inputs/outputs/parameters should be set when creating the underlying +# triton inference request? The inference request fields required will differ +# for vLLM, TRT-LLM, and user-defined models like a custom python model. So we +# need to know how to correctly translate the OpenAI schema parameters to +# a triton inference request. +# - For now, we will look for either vllm or trtllm in list of loaded backends, +# and we consider python==trtllm for now due to possibility of python runtime. +# We may want to consider using Triton's "runtime" config field for this for +# easier detection instead. +def load_models(server): + model_metadatas = [] + backends = [] + # TODO: Support tokenizers more generically or custom tokenizers, possibly + # by looking for tokenizer.json in a pre-specified location? + tokenizer = None + tokenizer_model = os.environ.get("TOKENIZER") + if tokenizer_model: + print(f"Using env var TOKENIZER={tokenizer_model} to determine the tokenizer") + tokenizer = get_tokenizer(tokenizer_model) -# TODO: Refactor - this function seems to load a single model, -# but iterates through all models? -def load_model(server): - model = None + models = [] backends = [] - tokenizer = None - source_name = None - model_name = None - for model_name, version in server.models().keys(): + names = [] + # Load all triton models and gather the respective backends of each + for name, version in server.models().keys(): + # TODO: Why skip known version? Already loaded? 
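+        # Only entries reported with version -1 are loaded below; entries
+        # that already carry a concrete version number are skipped.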
if version != -1: continue - model = server.load(model_name) - backends.append(model.config()["backend"]) - if model_name in KNOWN_MODELS.keys(): - source_name = KNOWN_MODELS[model_name].replace("hf:", "") - tokenizer = get_tokenizer(source_name) + + model = server.load(name) + backend = model.config()["backend"] + + names.append(name) + models.append(model) + backends.append(backend) + print(f"Loaded: Model={name}, Backend={backend}.") create_time = int(time.time()) - backend = None - for be in backends: - if be in SUPPORTED_BACKENDS: - backend = be - break - - # TODO - return TritonModelMetadata( - name=model_name, - backend=backend, - model=model, - tokenizer=tokenizer, - source_name=source_name, - create_time=create_time, - ) + + # One tokenizer, convert function, and creation time for all loaded models. + # NOTE: This doesn't currently support having both a vLLM and TRT-LLM + # model loaded at the same time. + for name, model, backend in zip(names, models, backends): + metadata = TritonModelMetadata( + name=name, + backend=backend, + model=model, + tokenizer=tokenizer, + create_time=create_time, + request_convert_fn=determine_request_format(backend), + ) + model_metadatas.append(metadata) + + return model_metadatas def init_tritonserver(): @@ -84,37 +131,9 @@ def init_tritonserver(): model_control_mode=tritonserver.ModelControlMode.EXPLICIT, ).start(wait_until_ready=True) - # TODO: Cleanup print("Loading Model...\n\n") - - # model, model_create_time, backend, tokenizer, _ = load_model(server) - metadata = load_model(server) - - # TODO: pydantic validation? - if not metadata.name: - raise Exception("Unknown Model Name") - - if not metadata.model: - raise Exception("Unknown Model") - - if not metadata.backend: - raise Exception("Unsupported Backend") - - # NOTE: Allow no tokenizer for mock python model for testing purposes - if not metadata.tokenizer and metadata.backend in LLM_BACKENDS: - raise Exception("Unsupported Tokenizer") - - if not metadata.create_time: - raise Exception("Unknown Model Creation Time") - - print(f"\n\nModel: {metadata.name} Loaded with Backend: {metadata.backend}\n\n") - - # if backend == "vllm": - # create_inference_request = create_vllm_inference_request - # elif backend == "tensorrtllm": - # create_inference_request = create_trtllm_inference_request - - return server, metadata + metadatas = load_models(server) + return server, metadatas def get_output(response): @@ -147,8 +166,8 @@ def create_vllm_inference_request( if echo: exclude_input_in_output = not echo inputs["exclude_input_in_output"] = [exclude_input_in_output] - print(f"[DEBUG] {inputs=}") + print(f"[DEBUG] Triton Inference Request {inputs=}") return model.create_request(inputs=inputs, parameters=sampling_parameters) @@ -157,8 +176,6 @@ def create_trtllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): inputs = {} - if model.name == "llama-3-8b-instruct": - inputs["stop_words"] = [["<|eot_id|>", "<|end_of_text|>"]] inputs["text_input"] = [[prompt]] inputs["stream"] = [[request.stream]] if request.max_tokens: @@ -167,15 +184,17 @@ def create_trtllm_inference_request( if isinstance(request.stop, str): request.stop = [request.stop] inputs["stop_words"] = [request.stop] - if request.top_p: + # Check "is not None" specifically, because values of zero are valid. 
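+    # (A bare truthiness check like `if request.top_p:` silently drops valid
+    # zero values, e.g. temperature=0.0 for greedy sampling.)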
+ if request.top_p is not None: inputs["top_p"] = np.float32([[request.top_p]]) - if request.frequency_penalty: + if request.frequency_penalty is not None: inputs["frequency_penalty"] = np.float32([[request.frequency_penalty]]) - if request.presence_penalty: - inputs["presence_penalty":] = np.int32([[request.presence_penalty]]) - if request.seed: + if request.presence_penalty is not None: + inputs["presence_penalty"] = np.float32([[request.presence_penalty]]) + if request.seed is not None: inputs["random_seed"] = np.uint64([[request.seed]]) - if request.temperature: + if request.temperature is not None: inputs["temperature"] = np.float32([[request.temperature]]) + print(f"[DEBUG] Triton Inference Request {inputs=}") return model.create_request(inputs=inputs) From 567abf3c2f70ef64f204dce8b2545524092cc54a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 14 Aug 2024 14:43:42 -0700 Subject: [PATCH 14/80] Add streaming test placeholders, add test where no tokenizer is defined --- .../src/tests/test_chat_completions.py | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/example/src/tests/test_chat_completions.py index 493bd5eafd..4434a776a1 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ b/qa/L0_openai/example/src/tests/test_chat_completions.py @@ -51,28 +51,6 @@ def setup_app(self, tokenizer: str, model_repository: str): app = init_app() return app - # A TOKENIZER must be known for /chat/completions endpoint in order to - # apply chat templates, and for simplicity in determination, users should - # define the TOKENIZER. So, explicitly raise an error if none is provided. - def test_chat_completions_no_tokenizer(self): - model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app = self.setup_app(tokenizer="", model_repository=model_repository) - with TestClient(app) as client: - response = client.post( - "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, - ) - assert response.status_code == 400 - assert response.json()["detail"] == "Unknown tokenizer" - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_chat_completions_streaming(self, client): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_chat_completions_no_streaming(self, client): - pass - def test_chat_completions_defaults(self, client): response = client.post( "/v1/chat/completions", @@ -426,6 +404,14 @@ def test_chat_completions_multiple_choices(self, client): assert response.status_code == 400 assert response.json()["detail"] == "Only single choice is supported" + @pytest.mark.skip(reason="Not Implemented Yet") + def test_chat_completions_streaming(self, client): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_chat_completions_no_streaming(self, client): + pass + @pytest.mark.skip(reason="Not Implemented Yet") def test_function_calling(self): pass @@ -442,3 +428,27 @@ def test_multi_lora(self): @pytest.mark.skip(reason="Not Implemented Yet") def test_usage_response(self): pass + + +# For tests that won't use the same pytest fixture for server startup across +# the whole class test suite. 
+class TestChatCompletionsNoFixture: + def setup_app(self, tokenizer: str, model_repository: str): + os.environ["TOKENIZER"] = tokenizer + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + app = init_app() + return app + + # A TOKENIZER must be known for /chat/completions endpoint in order to + # apply chat templates, and for simplicity in determination, users should + # define the TOKENIZER. So, explicitly raise an error if none is provided. + def test_chat_completions_no_tokenizer(self): + model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") + app = self.setup_app(tokenizer="", model_repository=model_repository) + with TestClient(app) as client: + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, + ) + assert response.status_code == 400 + assert response.json()["detail"] == "Unknown tokenizer" From 6e1bfafa20f63f7d65b23155f86cd44d8ad3cbce Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 15 Aug 2024 18:35:47 -0700 Subject: [PATCH 15/80] Add OpenAI Python Client tests, add streaming chat completions test, add OpenAIServer utility for testing without FastAPI TestClient, rename folder from example to openai for clarity that the source code isn't an example, add some usage examples with curl, genai-perf, and openai client, add --tokenizer to main.py --- .../example/src/tests/test_openai_client.py | 84 -------- qa/L0_openai/examples/chat.sh | 7 + qa/L0_openai/examples/genai_perf.sh | 12 ++ qa/L0_openai/examples/models.sh | 3 + qa/L0_openai/examples/openai_client.py | 21 ++ qa/L0_openai/examples/streaming_curl.sh | 18 ++ qa/L0_openai/{example => openai}/Dockerfile | 3 +- qa/L0_openai/{example => openai}/README.md | 0 qa/L0_openai/{example => openai}/main.py | 13 +- .../{example => openai}/src/__init__.py | 0 .../{example => openai}/src/api_server.py | 0 .../src/routers/__init__.py | 0 .../src/routers/chat_completions.py | 24 ++- .../src/routers/completions.py | 0 .../{example => openai}/src/routers/models.py | 3 +- .../src/routers/observability.py | 0 .../src/schemas/__init__.py | 0 .../{example => openai}/src/schemas/openai.py | 0 .../{example => openai}/src/tests/__init__.py | 0 .../tests/tensorrtllm_models/ensemble/1/.tmp | 0 .../tensorrtllm_models/ensemble/config.pbtxt | 0 .../postprocessing/1/model.py | 0 .../postprocessing/config.pbtxt | 0 .../preprocessing/1/model.py | 0 .../preprocessing/config.pbtxt | 0 .../tensorrt_llm/1/.gitkeep | 0 .../tensorrt_llm/1/model.py | 0 .../tensorrt_llm/config.pbtxt | 0 .../tensorrt_llm_bls/1/lib/decode.py | 0 .../tensorrt_llm_bls/1/lib/triton_decoder.py | 0 .../tensorrt_llm_bls/1/model.py | 0 .../tensorrt_llm_bls/config.pbtxt | 0 .../src/tests/test_chat_completions.py | 29 ++- .../src/tests/test_completions.py | 0 .../src/tests/test_models/mock_llm/1/model.py | 0 .../tests/test_models/mock_llm/config.pbtxt | 0 .../src/tests/test_observability.py | 0 .../openai/src/tests/test_openai_client.py | 189 ++++++++++++++++++ qa/L0_openai/openai/src/tests/utils.py | 84 ++++++++ .../llama-3-8b-instruct/1/model.json | 0 .../llama-3-8b-instruct/config.pbtxt | 0 .../{example => openai}/src/utils/__init__.py | 0 .../src/utils/tokenizer.py | 0 .../{example => openai}/src/utils/triton.py | 6 +- 44 files changed, 391 insertions(+), 105 deletions(-) delete mode 100644 qa/L0_openai/example/src/tests/test_openai_client.py create mode 100755 qa/L0_openai/examples/chat.sh create mode 100755 qa/L0_openai/examples/genai_perf.sh create mode 100755 qa/L0_openai/examples/models.sh create 
mode 100755 qa/L0_openai/examples/openai_client.py create mode 100755 qa/L0_openai/examples/streaming_curl.sh rename qa/L0_openai/{example => openai}/Dockerfile (76%) rename qa/L0_openai/{example => openai}/README.md (100%) rename qa/L0_openai/{example => openai}/main.py (81%) mode change 100644 => 100755 rename qa/L0_openai/{example => openai}/src/__init__.py (100%) rename qa/L0_openai/{example => openai}/src/api_server.py (100%) rename qa/L0_openai/{example => openai}/src/routers/__init__.py (100%) rename qa/L0_openai/{example => openai}/src/routers/chat_completions.py (89%) rename qa/L0_openai/{example => openai}/src/routers/completions.py (100%) rename qa/L0_openai/{example => openai}/src/routers/models.py (97%) rename qa/L0_openai/{example => openai}/src/routers/observability.py (100%) rename qa/L0_openai/{example => openai}/src/schemas/__init__.py (100%) rename qa/L0_openai/{example => openai}/src/schemas/openai.py (100%) rename qa/L0_openai/{example => openai}/src/tests/__init__.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/ensemble/1/.tmp (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/ensemble/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/postprocessing/1/model.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/postprocessing/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/preprocessing/1/model.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/preprocessing/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py (100%) rename qa/L0_openai/{example => openai}/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/test_chat_completions.py (95%) rename qa/L0_openai/{example => openai}/src/tests/test_completions.py (100%) rename qa/L0_openai/{example => openai}/src/tests/test_models/mock_llm/1/model.py (100%) rename qa/L0_openai/{example => openai}/src/tests/test_models/mock_llm/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/tests/test_observability.py (100%) create mode 100644 qa/L0_openai/openai/src/tests/test_openai_client.py create mode 100644 qa/L0_openai/openai/src/tests/utils.py rename qa/L0_openai/{example => openai}/src/tests/vllm_models/llama-3-8b-instruct/1/model.json (100%) rename qa/L0_openai/{example => openai}/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt (100%) rename qa/L0_openai/{example => openai}/src/utils/__init__.py (100%) rename qa/L0_openai/{example => openai}/src/utils/tokenizer.py (100%) rename qa/L0_openai/{example => openai}/src/utils/triton.py (98%) diff --git a/qa/L0_openai/example/src/tests/test_openai_client.py b/qa/L0_openai/example/src/tests/test_openai_client.py deleted file mode 100644 index c83d69a9b1..0000000000 --- a/qa/L0_openai/example/src/tests/test_openai_client.py +++ /dev/null @@ 
-1,84 +0,0 @@ -import os -from pathlib import Path - -import pytest -from openai import OpenAI - - -class TestOpenAIClient: - # Start server, then with scope="class", pass execution back to each test - # until all tests in the class have been run, then clean up. - # TODO: OpenAI client requires server is already running - @pytest.fixture(scope="class", autouse=True) - def start_server(self): - model_repository = Path(__file__).parent / "vllm_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - - # TODO: Start server in background - # ex: https://github.com/vllm-project/vllm/blob/main/tests/utils.py - # proc = subprocess.run(...) - yield - # proc.terminate() - # proc.wait() - # proc.kill() - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_openai_client_completion(self): - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - models = client.models.list() - print(f"Models: {models}") - model = models.data[0].id - print(f"Model: {model}") - - completion = client.completions.create( - prompt="Hi there", - model=model, - ) - - assert completion - print(f"Completion results: {completion}") - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_openai_client_chat_completion(self): - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - models = client.models.list() - print(f"Models: {models}") - model = models.data[0].id - print(f"Model: {model}") - - chat_completion = client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Who won the world series in 2020?"}, - { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020.", - }, - {"role": "user", "content": "Where was it played?"}, - ], - model=model, - ) - - assert chat_completion - assert chat_completion.choices - assert chat_completion.choices[0] - assert chat_completion.choices[0].finish_reason == "stop" - print(f"Chat completion results: {chat_completion}") - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_openai_client_function_calling(self): - pass diff --git a/qa/L0_openai/examples/chat.sh b/qa/L0_openai/examples/chat.sh new file mode 100755 index 0000000000..a8f3d6d8ef --- /dev/null +++ b/qa/L0_openai/examples/chat.sh @@ -0,0 +1,7 @@ +#!/bin/bash +curl -s http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "tensorrt_llm_bls", + "messages": [{"role": "user", "content": "Say this is a test!"}] + }' | jq diff --git a/qa/L0_openai/examples/genai_perf.sh b/qa/L0_openai/examples/genai_perf.sh new file mode 100755 index 0000000000..1e3f44edcf --- /dev/null +++ b/qa/L0_openai/examples/genai_perf.sh @@ -0,0 +1,12 @@ +#!/bin/bash +genai-perf \ + --model tensorrt_llm_bls \ + --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ + --service-kind openai \ + --endpoint-type chat \ + --synthetic-input-tokens-mean 256 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 256 \ + --output-tokens-stddev 0 \ + # --streaming + # --extra-inputs stream:true diff --git a/qa/L0_openai/examples/models.sh b/qa/L0_openai/examples/models.sh new file mode 100755 index 0000000000..944fbe07af --- /dev/null +++ b/qa/L0_openai/examples/models.sh @@ -0,0 +1,3 @@ +#!/bin/bash +curl -s http://localhost:8000/v1/models \ + -H "Content-Type: 
application/json" | jq diff --git a/qa/L0_openai/examples/openai_client.py b/qa/L0_openai/examples/openai_client.py new file mode 100755 index 0000000000..0b7184af22 --- /dev/null +++ b/qa/L0_openai/examples/openai_client.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", +) + +completion = client.chat.completions.create( + model="tensorrt_llm_bls", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + {"role": "user", "content": "What are LLMs?"}, + ], + max_tokens=256, +) + +print(completion.choices[0].message.content) diff --git a/qa/L0_openai/examples/streaming_curl.sh b/qa/L0_openai/examples/streaming_curl.sh new file mode 100755 index 0000000000..33bebe253e --- /dev/null +++ b/qa/L0_openai/examples/streaming_curl.sh @@ -0,0 +1,18 @@ +#!/bin/bash +MODEL=${1:-"tensorrt_llm_bls"} +curl -s -N http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'${MODEL}'", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ], + "stream": true + }' diff --git a/qa/L0_openai/example/Dockerfile b/qa/L0_openai/openai/Dockerfile similarity index 76% rename from qa/L0_openai/example/Dockerfile rename to qa/L0_openai/openai/Dockerfile index 64d5637432..c053959a75 100644 --- a/qa/L0_openai/example/Dockerfile +++ b/qa/L0_openai/openai/Dockerfile @@ -1,4 +1,5 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" +# TODO: Add to requirements.txt +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" diff --git a/qa/L0_openai/example/README.md b/qa/L0_openai/openai/README.md similarity index 100% rename from qa/L0_openai/example/README.md rename to qa/L0_openai/openai/README.md diff --git a/qa/L0_openai/example/main.py b/qa/L0_openai/openai/main.py old mode 100644 new mode 100755 similarity index 81% rename from qa/L0_openai/example/main.py rename to qa/L0_openai/openai/main.py index 81b4f8066a..4f6f11a9f9 --- a/qa/L0_openai/example/main.py +++ b/qa/L0_openai/openai/main.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os @@ -29,12 +30,17 @@ def parse_args(): default=0, help="The tritonserver log verbosity level", ) - triton_group.add_argument( "--model-repository", type=str, default=None, - help="model repository", + help="Path to the Triton model repository holding the models to be served", + ) + triton_group.add_argument( + "--tokenizer", + type=str, + default=None, + help="HuggingFace ID of the Tokenizer to use for chat templates", ) return parser.parse_args() @@ -46,6 +52,9 @@ def parse_args(): # but use env vars for simplicity for now. 
if args.model_repository: os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository + if args.tokenizer: + os.environ["TOKENIZER"] = args.tokenizer + os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) app = init_app() diff --git a/qa/L0_openai/example/src/__init__.py b/qa/L0_openai/openai/src/__init__.py similarity index 100% rename from qa/L0_openai/example/src/__init__.py rename to qa/L0_openai/openai/src/__init__.py diff --git a/qa/L0_openai/example/src/api_server.py b/qa/L0_openai/openai/src/api_server.py similarity index 100% rename from qa/L0_openai/example/src/api_server.py rename to qa/L0_openai/openai/src/api_server.py diff --git a/qa/L0_openai/example/src/routers/__init__.py b/qa/L0_openai/openai/src/routers/__init__.py similarity index 100% rename from qa/L0_openai/example/src/routers/__init__.py rename to qa/L0_openai/openai/src/routers/__init__.py diff --git a/qa/L0_openai/example/src/routers/chat_completions.py b/qa/L0_openai/openai/src/routers/chat_completions.py similarity index 89% rename from qa/L0_openai/example/src/routers/chat_completions.py rename to qa/L0_openai/openai/src/routers/chat_completions.py index 189836ad5b..8abe5afdee 100644 --- a/qa/L0_openai/example/src/routers/chat_completions.py +++ b/qa/L0_openai/openai/src/routers/chat_completions.py @@ -19,12 +19,19 @@ router = APIRouter() +def get_first_response_role(conversation, add_generation_prompt, default_role): + if add_generation_prompt: + return default_role + + return conversation[-1]["role"] + + def streaming_chat_completion_response(request_id, created, model, role, responses): # first chunk choice = ChatCompletionStreamingResponseChoice( index=0, delta=ChatCompletionStreamResponseDelta( - role=role, content=None, function_call=None + role=role, content="", function_call=None ), logprobs=None, finish_reason=None, @@ -99,9 +106,6 @@ def create_chat_completion( if not metadata.backend: raise HTTPException(status_code=400, detail="Unknown backend") - add_generation_prompt_default = True - default_role = "assistant" - triton_model = raw_request.app.server.model(request.model) if request.model != triton_model.name: raise HTTPException( @@ -122,12 +126,16 @@ def create_chat_completion( for message in request.messages ] - # TODO: Use HF tokenizer or use Jinja/templater directly? - # TODO: Function Calling / tools related to this? + # NOTE: This behavior should be tested further + # TODO: Do these need to be exposed to the user? 
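# Editorial sketch (not part of the patch): roughly what the
# apply_chat_template() call below does, assuming a HuggingFace tokenizer that
# ships a chat template (e.g. the meta-llama/Meta-Llama-3-8B-Instruct tokenizer
# used elsewhere in these tests, which requires HF access). The message list is
# rendered into a single prompt string, and add_generation_prompt=True appends
# the assistant header so the model generates the next assistant turn.
from transformers import AutoTokenizer

demo_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
demo_prompt = demo_tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "What is machine learning?"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(demo_prompt)  # one prompt string ending with the assistant turn header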
+ add_generation_prompt = True + default_role = "assistant" + role = get_first_response_role(conversation, add_generation_prompt, default_role) + prompt = metadata.tokenizer.apply_chat_template( conversation=conversation, tokenize=False, - add_generation_prompt=add_generation_prompt_default, + add_generation_prompt=add_generation_prompt, ) request_id = f"cmpl-{uuid.uuid1()}" @@ -140,7 +148,7 @@ def create_chat_completion( if request.stream: return StreamingResponse( streaming_chat_completion_response( - request_id, created, request.model, conversation[-1]["role"], responses + request_id, created, request.model, role, responses ) ) diff --git a/qa/L0_openai/example/src/routers/completions.py b/qa/L0_openai/openai/src/routers/completions.py similarity index 100% rename from qa/L0_openai/example/src/routers/completions.py rename to qa/L0_openai/openai/src/routers/completions.py diff --git a/qa/L0_openai/example/src/routers/models.py b/qa/L0_openai/openai/src/routers/models.py similarity index 97% rename from qa/L0_openai/example/src/routers/models.py rename to qa/L0_openai/openai/src/routers/models.py index 6798a52289..ff47000cfd 100644 --- a/qa/L0_openai/example/src/routers/models.py +++ b/qa/L0_openai/openai/src/routers/models.py @@ -3,8 +3,7 @@ router = APIRouter() -# TODO: What is this for? -OWNED_BY = "ACME" +OWNED_BY = "Triton Inference Server" @router.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) diff --git a/qa/L0_openai/example/src/routers/observability.py b/qa/L0_openai/openai/src/routers/observability.py similarity index 100% rename from qa/L0_openai/example/src/routers/observability.py rename to qa/L0_openai/openai/src/routers/observability.py diff --git a/qa/L0_openai/example/src/schemas/__init__.py b/qa/L0_openai/openai/src/schemas/__init__.py similarity index 100% rename from qa/L0_openai/example/src/schemas/__init__.py rename to qa/L0_openai/openai/src/schemas/__init__.py diff --git a/qa/L0_openai/example/src/schemas/openai.py b/qa/L0_openai/openai/src/schemas/openai.py similarity index 100% rename from qa/L0_openai/example/src/schemas/openai.py rename to qa/L0_openai/openai/src/schemas/openai.py diff --git a/qa/L0_openai/example/src/tests/__init__.py b/qa/L0_openai/openai/src/tests/__init__.py similarity index 100% rename from qa/L0_openai/example/src/tests/__init__.py rename to qa/L0_openai/openai/src/tests/__init__.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/1/.tmp b/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/1/.tmp similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/1/.tmp rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/1/.tmp diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/ensemble/config.pbtxt rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/1/model.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt 
b/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/postprocessing/config.pbtxt rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/1/model.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/preprocessing/config.pbtxt rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py rename to qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py diff --git a/qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt rename to 
qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/test_chat_completions.py b/qa/L0_openai/openai/src/tests/test_chat_completions.py similarity index 95% rename from qa/L0_openai/example/src/tests/test_chat_completions.py rename to qa/L0_openai/openai/src/tests/test_chat_completions.py index 4434a776a1..28572a4acd 100644 --- a/qa/L0_openai/example/src/tests/test_chat_completions.py +++ b/qa/L0_openai/openai/src/tests/test_chat_completions.py @@ -36,7 +36,7 @@ class TestChatCompletions: # TODO: Consider module/package scope, or join Completions tests into same file # to run server only once for both sets of tests for faster iteration. - @pytest.fixture(scope="class", autouse=False) + @pytest.fixture(scope="class") def client(self): model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") app = self.setup_app( @@ -323,7 +323,7 @@ def test_chat_completions_temperature_tensorrtllm(self, client): assert response1_text == response2_text assert response1_text != response3_text - # Simple tests to verify seed roughly behaves as expected + # Simple tests to verify random seed roughly behaves as expected def test_chat_completions_seed(self, client): responses = [] payload1 = { @@ -408,9 +408,16 @@ def test_chat_completions_multiple_choices(self, client): def test_chat_completions_streaming(self, client): pass - @pytest.mark.skip(reason="Not Implemented Yet") def test_chat_completions_no_streaming(self, client): - pass + response = client.post( + "/v1/chat/completions", + json={"model": TEST_MODEL, "messages": TEST_MESSAGES, "stream": False}, + ) + + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" @pytest.mark.skip(reason="Not Implemented Yet") def test_function_calling(self): @@ -424,6 +431,18 @@ def test_lora(self): def test_multi_lora(self): pass + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_n_choices(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_logprobs(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_logit_bias(self): + pass + # TODO: Do we want to support "usage" field for token counts in response? @pytest.mark.skip(reason="Not Implemented Yet") def test_usage_response(self): @@ -432,7 +451,7 @@ def test_usage_response(self): # For tests that won't use the same pytest fixture for server startup across # the whole class test suite. 
-class TestChatCompletionsNoFixture: +class TestChatCompletionsCustomFixture: def setup_app(self, tokenizer: str, model_repository: str): os.environ["TOKENIZER"] = tokenizer os.environ["TRITON_MODEL_REPOSITORY"] = model_repository diff --git a/qa/L0_openai/example/src/tests/test_completions.py b/qa/L0_openai/openai/src/tests/test_completions.py similarity index 100% rename from qa/L0_openai/example/src/tests/test_completions.py rename to qa/L0_openai/openai/src/tests/test_completions.py diff --git a/qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py b/qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py similarity index 100% rename from qa/L0_openai/example/src/tests/test_models/mock_llm/1/model.py rename to qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py diff --git a/qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt b/qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/test_models/mock_llm/config.pbtxt rename to qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt diff --git a/qa/L0_openai/example/src/tests/test_observability.py b/qa/L0_openai/openai/src/tests/test_observability.py similarity index 100% rename from qa/L0_openai/example/src/tests/test_observability.py rename to qa/L0_openai/openai/src/tests/test_observability.py diff --git a/qa/L0_openai/openai/src/tests/test_openai_client.py b/qa/L0_openai/openai/src/tests/test_openai_client.py new file mode 100644 index 0000000000..349d8c9cf1 --- /dev/null +++ b/qa/L0_openai/openai/src/tests/test_openai_client.py @@ -0,0 +1,189 @@ +from pathlib import Path + +import openai +import pytest +from src.tests.utils import OpenAIServer + +### TEST ENVIRONMENT SETUP ### +TEST_BACKEND = "" +TEST_MODEL = "" +TEST_PROMPT = "What is machine learning?" +TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] +TEST_TOKENIZER = "meta-llama/Meta-Llama-3-8B-Instruct" +try: + import vllm as _ + + TEST_BACKEND = "vllm" + TEST_MODEL = "llama-3-8b-instruct" +except ImportError: + pass + +try: + import tensorrt_llm as _ + + TEST_BACKEND = "tensorrtllm" + TEST_MODEL = "tensorrt_llm_bls" +except ImportError: + pass + +if not TEST_BACKEND or not TEST_MODEL: + raise Exception("Unknown test environment") +### + + +# NOTE: OpenAI client requires actual server running, and won't work +# with the FastAPI TestClient. Run the server at module scope to run +# only once for all the tests below. 
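# Editorial sketch (not part of the patch): the OpenAI SDK always issues real
# HTTP requests to base_url, so unlike the FastAPI TestClient it needs a process
# listening on a TCP port. A minimal readiness probe in the same spirit as the
# OpenAIServer helper added later in this patch (host, port, and the /health
# endpoint here are assumptions matching the defaults used below):
import time

import requests

def wait_until_ready(url: str = "http://localhost:8000/health", timeout_s: float = 120.0):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=1).status_code == 200:
                return
        except requests.ConnectionError:
            pass
        time.sleep(0.5)
    raise RuntimeError(f"Server at {url} did not become ready within {timeout_s}s")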
+@pytest.fixture(scope="module") +def server(): + model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" + tokenizer = "meta-llama/Meta-Llama-3-8B-Instruct" + args = ["--model-repository", model_repository, "--tokenizer", tokenizer] + + with OpenAIServer(args) as openai_server: + yield openai_server + + +class TestOpenAIClient: + @pytest.fixture(scope="class") + def client(self, server): + return server.get_client() + + def test_openai_client_models(self, client: openai.OpenAI): + models = list(client.models.list()) + print(f"Models: {models}") + if TEST_BACKEND == "tensorrtllm": + # ensemble or tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 5 + elif TEST_BACKEND == "vllm": + assert len(models) == 1 + else: + raise Exception(f"Unexpected backend {TEST_BACKEND=}") + + def test_openai_client_completion(self, client: openai.OpenAI): + completion = client.completions.create( + prompt=TEST_PROMPT, + model=TEST_MODEL, + ) + + assert completion + print(f"Completion results: {completion}") + + def test_openai_client_chat_completion(self, client: openai.OpenAI): + chat_completion = client.chat.completions.create( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + model=TEST_MODEL, + ) + + assert chat_completion.choices[0].message.content + assert chat_completion.choices[0].finish_reason == "stop" + print(f"Chat completion results: {chat_completion}") + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_openai_client_function_calling(self): + pass + + +class TestAsyncOpenAIClient: + @pytest.fixture(scope="class") + def client(self, server): + return server.get_async_client() + + @pytest.mark.asyncio + async def test_openai_client_models(self, client: openai.AsyncOpenAI): + async_models = await client.models.list() + models = [model async for model in async_models] + print(f"Models: {models}") + if TEST_BACKEND == "tensorrtllm": + # ensemble or tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 5 + elif TEST_BACKEND == "vllm": + assert len(models) == 1 + else: + raise Exception(f"Unexpected backend {TEST_BACKEND=}") + + @pytest.mark.asyncio + async def test_openai_client_completion(self, client: openai.AsyncOpenAI): + completion = await client.completions.create( + prompt=TEST_PROMPT, + model=TEST_MODEL, + ) + + assert completion + print(f"Completion results: {completion}") + + @pytest.mark.asyncio + async def test_openai_client_chat_completion(self, client: openai.AsyncOpenAI): + chat_completion = await client.chat.completions.create( + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"}, + { + "role": "assistant", + "content": "The Los Angeles Dodgers won the World Series in 2020.", + }, + {"role": "user", "content": "Where was it played?"}, + ], + model=TEST_MODEL, + ) + + assert chat_completion.choices[0].message.content + assert chat_completion.choices[0].finish_reason == "stop" + print(f"Chat completion results: {chat_completion}") + + @pytest.mark.asyncio + async def test_chat_streaming(self, client: openai.AsyncOpenAI): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] + + 
# test single completion + chat_completion = await client.chat.completions.create( + model=TEST_MODEL, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=TEST_MODEL, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + print("[DEBUG] DELTA:", delta) + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert "".join(chunks) == output + + @pytest.mark.skip(reason="Not Implemented Yet") + @pytest.mark.asyncio + async def test_openai_client_function_calling(self): + pass diff --git a/qa/L0_openai/openai/src/tests/utils.py b/qa/L0_openai/openai/src/tests/utils.py new file mode 100644 index 0000000000..b9ef2dcb5f --- /dev/null +++ b/qa/L0_openai/openai/src/tests/utils.py @@ -0,0 +1,84 @@ +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + +import openai +import requests + + +# Heavily inspired by vLLM's test infrastructure +class OpenAIServer: + API_KEY = "EMPTY" # Triton's OpenAI server does not need API key + START_TIMEOUT = 120 # wait for server to start for up to 120 seconds + + def __init__( + self, + cli_args: List[str], + *, + env_dict: Optional[Dict[str, str]] = None, + ) -> None: + self.host = "localhost" + self.port = 8000 + + env = os.environ.copy() + if env_dict is not None: + env.update(env_dict) + + this_dir = Path(__file__).resolve().parent + script_path = this_dir / ".." / ".." 
/ "main.py" + self.proc = subprocess.Popen( + ["python3", script_path] + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + # Wait until health endpoint is responsive + self._wait_for_server(url=self.url_for("health"), timeout=self.START_TIMEOUT) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + try: + self.proc.wait(3) + except subprocess.TimeoutExpired: + # force kill if needed + self.proc.kill() + + def _wait_for_server(self, *, url: str, timeout: float): + start = time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as err: + result = self.proc.poll() + if result is not None and result != 0: + raise RuntimeError("Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError("Server failed to start in time.") from err + + @property + def url_root(self) -> str: + return f"http://{self.host}:{self.port}" + + def url_for(self, *parts: str) -> str: + return self.url_root + "/" + "/".join(parts) + + def get_client(self): + return openai.OpenAI( + base_url=self.url_for("v1"), + api_key=self.API_KEY, + ) + + def get_async_client(self): + return openai.AsyncOpenAI( + base_url=self.url_for("v1"), + api_key=self.API_KEY, + ) diff --git a/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json b/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json similarity index 100% rename from qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/1/model.json rename to qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json diff --git a/qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt b/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt similarity index 100% rename from qa/L0_openai/example/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt rename to qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt diff --git a/qa/L0_openai/example/src/utils/__init__.py b/qa/L0_openai/openai/src/utils/__init__.py similarity index 100% rename from qa/L0_openai/example/src/utils/__init__.py rename to qa/L0_openai/openai/src/utils/__init__.py diff --git a/qa/L0_openai/example/src/utils/tokenizer.py b/qa/L0_openai/openai/src/utils/tokenizer.py similarity index 100% rename from qa/L0_openai/example/src/utils/tokenizer.py rename to qa/L0_openai/openai/src/utils/tokenizer.py diff --git a/qa/L0_openai/example/src/utils/triton.py b/qa/L0_openai/openai/src/utils/triton.py similarity index 98% rename from qa/L0_openai/example/src/utils/triton.py rename to qa/L0_openai/openai/src/utils/triton.py index 594eeed08b..a654efceb3 100644 --- a/qa/L0_openai/example/src/utils/triton.py +++ b/qa/L0_openai/openai/src/utils/triton.py @@ -94,7 +94,7 @@ def load_models(server): names.append(name) models.append(model) backends.append(backend) - print(f"Loaded: Model={name}, Backend={backend}.") + print(f"Loaded: {name=}, {backend=}, tokenizer={tokenizer_model}") create_time = int(time.time()) @@ -131,7 +131,7 @@ def init_tritonserver(): model_control_mode=tritonserver.ModelControlMode.EXPLICIT, ).start(wait_until_ready=True) - print("Loading Model...\n\n") + print("Loading Models...") metadatas = load_models(server) return server, metadatas @@ -142,7 +142,7 @@ def get_output(response): return response.outputs["text_output"].to_string_array()[0] except: return str(response.outputs["text_output"].to_bytes_array()[0]) 
- return None + return "" def create_vllm_inference_request( From 4e3a441bdf9f7b4e039dfdafb504b766d98ab9c6 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 15 Aug 2024 18:52:26 -0700 Subject: [PATCH 16/80] Add 'echo' parameter test, but skip it for TRT-LLm due to only supporting it at model load time. Cleanup completions tests --- .../openai/src/tests/test_openai_client.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/qa/L0_openai/openai/src/tests/test_openai_client.py b/qa/L0_openai/openai/src/tests/test_openai_client.py index 349d8c9cf1..0405fbbfb7 100644 --- a/qa/L0_openai/openai/src/tests/test_openai_client.py +++ b/qa/L0_openai/openai/src/tests/test_openai_client.py @@ -67,8 +67,9 @@ def test_openai_client_completion(self, client: openai.OpenAI): model=TEST_MODEL, ) - assert completion print(f"Completion results: {completion}") + assert completion.choices[0].text + assert completion.choices[0].finish_reason == "stop" def test_openai_client_chat_completion(self, client: openai.OpenAI): chat_completion = client.chat.completions.create( @@ -84,9 +85,27 @@ def test_openai_client_chat_completion(self, client: openai.OpenAI): model=TEST_MODEL, ) + print(f"Chat completion results: {chat_completion}") assert chat_completion.choices[0].message.content assert chat_completion.choices[0].finish_reason == "stop" - print(f"Chat completion results: {chat_completion}") + + @pytest.mark.skipif( + TEST_BACKEND == "tensorrtllm", + reason="TRT-LLM backend currently only supports setting this parameter at model load time", + ) + @pytest.mark.parametrize("echo", [False, True]) + def test_openai_client_completion_echo(self, client: openai.OpenAI, echo: bool): + prompt = "What is the capital of France?" + completion = client.completions.create( + prompt=prompt, model=TEST_MODEL, echo=echo + ) + + print(f"Completion results: {completion}") + response = completion.choices[0].text + if echo: + assert prompt in response + else: + assert prompt not in response @pytest.mark.skip(reason="Not Implemented Yet") def test_openai_client_function_calling(self): @@ -119,8 +138,9 @@ async def test_openai_client_completion(self, client: openai.AsyncOpenAI): model=TEST_MODEL, ) - assert completion print(f"Completion results: {completion}") + assert completion.choices[0].text + assert completion.choices[0].finish_reason == "stop" @pytest.mark.asyncio async def test_openai_client_chat_completion(self, client: openai.AsyncOpenAI): From 523f3698caddccda0b2395424cffe4acf54eb661 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 15 Aug 2024 19:12:20 -0700 Subject: [PATCH 17/80] Fix issue with finish_reason for non-streaming completion when using a decoupled model with an empty final response, add response validation for this scenario --- .../openai/src/routers/chat_completions.py | 20 +++++++++++++++- .../openai/src/routers/completions.py | 24 +++++++++++++++++-- .../openai/src/tests/test_openai_client.py | 6 +++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/qa/L0_openai/openai/src/routers/chat_completions.py b/qa/L0_openai/openai/src/routers/chat_completions.py index 8abe5afdee..45831e0c5a 100644 --- a/qa/L0_openai/openai/src/routers/chat_completions.py +++ b/qa/L0_openai/openai/src/routers/chat_completions.py @@ -152,8 +152,26 @@ def create_chat_completion( ) ) - response = list(responses)[0] + # Response validation with decoupled models in mind + responses = list(responses) + num_responses = len(responses) + if num_responses == 1 and responses[0].final != 
True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses == 2 and responses[-1].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses > 2: + raise HTTPException( + status_code=400, + detail=f"Unexpected number of responses: {num_responses}, expected 1.", + ) + response = responses[0] text = get_output(response) return CreateChatCompletionResponse( diff --git a/qa/L0_openai/openai/src/routers/completions.py b/qa/L0_openai/openai/src/routers/completions.py index 71954574cd..76163c8667 100644 --- a/qa/L0_openai/openai/src/routers/completions.py +++ b/qa/L0_openai/openai/src/routers/completions.py @@ -99,11 +99,31 @@ def create_completion( return StreamingResponse( streaming_completion_response(request_id, created, metadata.name, responses) ) - response = list(responses)[0] + + # Response validation with decoupled models in mind + responses = list(responses) + num_responses = len(responses) + if num_responses == 1 and responses[0].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses == 2 and responses[-1].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses > 2: + raise HTTPException( + status_code=400, + detail=f"Unexpected number of responses: {num_responses}, expected 1.", + ) + + response = responses[0] text = get_output(response) choice = Choice( - finish_reason=FinishReason.stop if response.final else None, + finish_reason=FinishReason.stop, index=0, logprobs=None, text=text, diff --git a/qa/L0_openai/openai/src/tests/test_openai_client.py b/qa/L0_openai/openai/src/tests/test_openai_client.py index 0405fbbfb7..c6413f5614 100644 --- a/qa/L0_openai/openai/src/tests/test_openai_client.py +++ b/qa/L0_openai/openai/src/tests/test_openai_client.py @@ -161,6 +161,12 @@ async def test_openai_client_chat_completion(self, client: openai.AsyncOpenAI): assert chat_completion.choices[0].finish_reason == "stop" print(f"Chat completion results: {chat_completion}") + # TODO: Add this test + @pytest.mark.skip(reason="Not Implemented Yet") + @pytest.mark.asyncio + async def test_completion_streaming(self, client: openai.AsyncOpenAI): + pass + @pytest.mark.asyncio async def test_chat_streaming(self, client: openai.AsyncOpenAI): messages = [ From 75f71ce28c7a3b2423e48d28195463968c2d511f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 15 Aug 2024 19:20:00 -0700 Subject: [PATCH 18/80] Move triton response validation into common triton utils --- .../openai/src/routers/chat_completions.py | 20 ++---------------- .../openai/src/routers/completions.py | 20 ++---------------- qa/L0_openai/openai/src/utils/triton.py | 21 ++++++++++++++++++- 3 files changed, 24 insertions(+), 37 deletions(-) diff --git a/qa/L0_openai/openai/src/routers/chat_completions.py b/qa/L0_openai/openai/src/routers/chat_completions.py index 45831e0c5a..762cb14df8 100644 --- a/qa/L0_openai/openai/src/routers/chat_completions.py +++ b/qa/L0_openai/openai/src/routers/chat_completions.py @@ -14,7 +14,7 @@ CreateChatCompletionStreamResponse, ObjectType, ) -from src.utils.triton import get_output +from src.utils.triton import get_output, validate_triton_responses router = APIRouter() @@ -154,23 +154,7 @@ def create_chat_completion( # Response validation with decoupled models in 
mind responses = list(responses) - num_responses = len(responses) - if num_responses == 1 and responses[0].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses == 2 and responses[-1].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses > 2: - raise HTTPException( - status_code=400, - detail=f"Unexpected number of responses: {num_responses}, expected 1.", - ) - + validate_triton_responses(responses) response = responses[0] text = get_output(response) diff --git a/qa/L0_openai/openai/src/routers/completions.py b/qa/L0_openai/openai/src/routers/completions.py index 76163c8667..92dee5807d 100644 --- a/qa/L0_openai/openai/src/routers/completions.py +++ b/qa/L0_openai/openai/src/routers/completions.py @@ -10,7 +10,7 @@ FinishReason, ObjectType, ) -from src.utils.triton import get_output +from src.utils.triton import get_output, validate_triton_responses router = APIRouter() @@ -102,23 +102,7 @@ def create_completion( # Response validation with decoupled models in mind responses = list(responses) - num_responses = len(responses) - if num_responses == 1 and responses[0].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses == 2 and responses[-1].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses > 2: - raise HTTPException( - status_code=400, - detail=f"Unexpected number of responses: {num_responses}, expected 1.", - ) - + validate_triton_responses(responses) response = responses[0] text = get_output(response) diff --git a/qa/L0_openai/openai/src/utils/triton.py b/qa/L0_openai/openai/src/utils/triton.py index a654efceb3..42a92fa34d 100644 --- a/qa/L0_openai/openai/src/utils/triton.py +++ b/qa/L0_openai/openai/src/utils/triton.py @@ -5,6 +5,7 @@ import numpy as np import tritonserver +from fastapi import HTTPException from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest from src.utils.tokenizer import get_tokenizer @@ -145,6 +146,25 @@ def get_output(response): return "" +def validate_triton_responses(responses): + num_responses = len(responses) + if num_responses == 1 and responses[0].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses == 2 and responses[-1].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses > 2: + raise HTTPException( + status_code=400, + detail=f"Unexpected number of responses: {num_responses}, expected 1.", + ) + + def create_vllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): @@ -171,7 +191,6 @@ def create_vllm_inference_request( return model.create_request(inputs=inputs, parameters=sampling_parameters) -# TODO: test def create_trtllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): From 118887cd051f6caf185b2fe16d688651a9467f71 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 16 Aug 2024 12:30:27 -0700 Subject: [PATCH 19/80] Reduce code copying and global variables, use conftest.py for shared fixtures and logic --- qa/L0_openai/openai/src/tests/conftest.py | 75 ++++++++ 
.../openai/src/tests/test_chat_completions.py | 168 ++++++++---------- .../openai/src/tests/test_completions.py | 114 +++++------- .../openai/src/tests/test_observability.py | 28 +-- .../openai/src/tests/test_openai_client.py | 140 +++++---------- qa/L0_openai/openai/src/tests/utils.py | 11 +- .../llama-3-8b-instruct/1/model.json | 1 - .../llama-3-8b-instruct/config.pbtxt | 2 - .../llama-3.1-8b-instruct/1/model.json | 1 + .../llama-3.1-8b-instruct/config.pbtxt | 2 + 10 files changed, 262 insertions(+), 280 deletions(-) create mode 100644 qa/L0_openai/openai/src/tests/conftest.py delete mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json delete mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt create mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json create mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt diff --git a/qa/L0_openai/openai/src/tests/conftest.py b/qa/L0_openai/openai/src/tests/conftest.py new file mode 100644 index 0000000000..2ff9697d59 --- /dev/null +++ b/qa/L0_openai/openai/src/tests/conftest.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from src.tests.utils import OpenAIServer, setup_fastapi_app + +### TEST ENVIRONMENT SETUP ### +TEST_BACKEND = "" +TEST_MODEL = "" +TEST_PROMPT = "What is machine learning?" +TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] +TEST_TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct" +try: + import vllm as _ + + TEST_BACKEND = "vllm" + TEST_MODEL = "llama-3.1-8b-instruct" +except ImportError: + pass + +try: + import tensorrt_llm as _ + + TEST_BACKEND = "tensorrtllm" + TEST_MODEL = "tensorrt_llm_bls" +except ImportError: + pass + +if not TEST_BACKEND or not TEST_MODEL: + raise Exception("Unknown test environment") +### + + +# NOTE: OpenAI client requires actual server running, and won't work +# with the FastAPI TestClient. Run the server at module scope to run +# only once for all the tests below. +@pytest.fixture(scope="module") +def server(): + model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" + args = ["--model-repository", model_repository, "--tokenizer", TEST_TOKENIZER] + + with OpenAIServer(args) as openai_server: + yield openai_server + + +# NOTE: The FastAPI TestClient acts like a server and triggers the FastAPI app +# lifespan startup/shutdown, but does not actually expose the network port to interact +# with arbitrary clients - you must use the TestClient returned to interact with +# the "server" when "starting the server" via TestClient. 
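# Editorial sketch (not part of the patch): a self-contained illustration of the
# TestClient behavior described in the NOTE above. Entering the context manager
# runs the app's lifespan startup (where this project starts tritonserver),
# exiting runs shutdown, and no TCP port is ever bound - requests go straight to
# the ASGI app in-process.
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.testclient import TestClient

@asynccontextmanager
async def demo_lifespan(app: FastAPI):
    print("startup")   # e.g. start the Triton server core here
    yield
    print("shutdown")  # e.g. stop the Triton server core here

demo_app = FastAPI(lifespan=demo_lifespan)
with TestClient(demo_app) as demo_client:   # "startup" runs here
    assert demo_client.get("/docs").status_code == 200
# "shutdown" runs when the with-block exits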
+@pytest.fixture(scope="class") +def fastapi_client_class_scope(): + model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") + app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, model_repository=model_repository) + with TestClient(app) as test_client: + yield test_client + + +@pytest.fixture +def model(): + return TEST_MODEL + + +@pytest.fixture +def backend(): + return TEST_BACKEND + + +@pytest.fixture +def prompt(): + return TEST_PROMPT + + +@pytest.fixture +def messages(): + return TEST_MESSAGES diff --git a/qa/L0_openai/openai/src/tests/test_chat_completions.py b/qa/L0_openai/openai/src/tests/test_chat_completions.py index 28572a4acd..d02a127412 100644 --- a/qa/L0_openai/openai/src/tests/test_chat_completions.py +++ b/qa/L0_openai/openai/src/tests/test_chat_completions.py @@ -1,60 +1,21 @@ import copy -import os from pathlib import Path +from typing import List import pytest from fastapi.testclient import TestClient -from src.api_server import init_app - -### TEST ENVIRONMENT SETUP ### -TEST_BACKEND = "" -TEST_MODEL = "" -TEST_PROMPT = "What is machine learning?" -TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] -TEST_TOKENIZER = "meta-llama/Meta-Llama-3-8B-Instruct" -try: - import vllm as _ - - TEST_BACKEND = "vllm" - TEST_MODEL = "llama-3-8b-instruct" -except ImportError: - pass - -try: - import tensorrt_llm as _ - - TEST_BACKEND = "tensorrtllm" - TEST_MODEL = "tensorrt_llm_bls" -except ImportError: - pass - -if not TEST_BACKEND or not TEST_MODEL: - raise Exception("Unknown test environment") -### +from src.tests.utils import setup_fastapi_app class TestChatCompletions: - # TODO: Consider module/package scope, or join Completions tests into same file - # to run server only once for both sets of tests for faster iteration. @pytest.fixture(scope="class") - def client(self): - model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app = self.setup_app( - tokenizer=TEST_TOKENIZER, model_repository=model_repository - ) - with TestClient(app) as test_client: - yield test_client - - def setup_app(self, tokenizer: str, model_repository: str): - os.environ["TOKENIZER"] = tokenizer - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - app = init_app() - return app + def client(self, fastapi_client_class_scope): + yield fastapi_client_class_scope - def test_chat_completions_defaults(self, client): + def test_chat_completions_defaults(self, client, model: str, messages: List[dict]): response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, + json={"model": model, "messages": messages}, ) assert response.status_code == 200 @@ -64,17 +25,17 @@ def test_chat_completions_defaults(self, client): # "usage" currently not supported assert response.json()["usage"] == None - def test_chat_completions_system_prompt(self, client): + def test_chat_completions_system_prompt(self, client, model: str): # NOTE: Currently just sanity check that there are no issues when a # system role is provided. There is no test logic to measure the quality # of the response yet. 
messages = [ {"role": "system", "content": "You are a Triton Inference Server expert."}, - {"role": "user", "content": TEST_PROMPT}, + {"role": "user", "content": "What is machine learning?"}, ] response = client.post( - "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + "/v1/chat/completions", json={"model": model, "messages": messages} ) assert response.status_code == 200 @@ -82,14 +43,14 @@ def test_chat_completions_system_prompt(self, client): assert message["content"].strip() assert message["role"] == "assistant" - def test_chat_completions_system_prompt_only(self, client): + def test_chat_completions_system_prompt_only(self, client, model: str): # No user prompt provided messages = [ {"role": "system", "content": "You are a Triton Inference Server expert."} ] response = client.post( - "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + "/v1/chat/completions", json={"model": model, "messages": messages} ) assert response.status_code == 200 @@ -98,7 +59,7 @@ def test_chat_completions_system_prompt_only(self, client): assert message["role"] == "assistant" @pytest.mark.parametrize( - "sampling_parameter, value", + "param_key, param_value", [ ("temperature", 0.7), ("max_tokens", 10), @@ -111,20 +72,20 @@ def test_chat_completions_system_prompt_only(self, client): ], ) def test_chat_completions_sampling_parameters( - self, client, sampling_parameter, value + self, client, param_key, param_value, model: str, messages: List[dict] ): response = client.post( "/v1/chat/completions", json={ - "model": TEST_MODEL, - "messages": TEST_MESSAGES, - sampling_parameter: value, + "model": model, + "messages": messages, + param_key: param_value, }, ) # TODO: Add support and remove this check unsupported_parameters = ["logprobs", "logit_bias"] - if sampling_parameter in unsupported_parameters: + if param_key in unsupported_parameters: assert response.status_code == 400 assert response.json()["detail"] == "logit bias and log probs not supported" return @@ -134,7 +95,7 @@ def test_chat_completions_sampling_parameters( assert response.json()["choices"][0]["message"]["role"] == "assistant" @pytest.mark.parametrize( - "sampling_parameter, value", + "param_key, param_value", [ ("temperature", 2.1), ("temperature", -0.1), @@ -147,14 +108,14 @@ def test_chat_completions_sampling_parameters( ], ) def test_chat_completions_invalid_sampling_parameters( - self, client, sampling_parameter, value + self, client, param_key, param_value, model: str, messages: List[dict] ): response = client.post( "/v1/chat/completions", json={ - "model": TEST_MODEL, - "messages": TEST_MESSAGES, - sampling_parameter: value, + "model": model, + "messages": messages, + param_key: param_value, }, ) @@ -162,9 +123,11 @@ def test_chat_completions_invalid_sampling_parameters( assert response.status_code == 422 # Simple tests to verify max_tokens roughly behaves as expected - def test_chat_completions_max_tokens(self, client): + def test_chat_completions_max_tokens( + self, client, model: str, messages: List[dict] + ): responses = [] - payload = {"model": TEST_MODEL, "messages": TEST_MESSAGES, "max_tokens": 1} + payload = {"model": model, "messages": messages, "max_tokens": 1} # Send two requests with max_tokens = 1 to check their similarity payload["max_tokens"] = 1 @@ -206,17 +169,22 @@ def test_chat_completions_max_tokens(self, client): assert len(response1_text) == len(response2_text) == 1 assert len(response3_text) > len(response1_text) - @pytest.mark.skipif(TEST_BACKEND != "vllm", reason="Only 
used to test vLLM backend") @pytest.mark.parametrize( "temperature", [0.0, 1.0], ) # Simple tests to verify temperature roughly behaves as expected - def test_chat_completions_temperature_vllm(self, client, temperature): + def test_chat_completions_temperature_vllm( + self, client, temperature, backend: str, model: str, messages: List[dict] + ): + if backend != "vllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + responses = [] payload = { - "model": TEST_MODEL, - "messages": TEST_MESSAGES, + "model": model, + "messages": messages, + "max_tokens": 256, "temperature": temperature, } @@ -248,14 +216,12 @@ def test_chat_completions_temperature_vllm(self, client, temperature): # that two equivalent requests produce the same response. if temperature == 0.0: # NOTE: This check may be ambitious to get an exact match in all - # frameworks depending on how other parameter defaults are set, so + # cases depending on how other parameter defaults are set, so # it can probably be removed if it introduces flakiness. - print(f"Comparing '{response1_text}' == '{response2_text}'") assert response1_text == response2_text # Temperature of 1.0 indicates maximum randomness, so check # that two equivalent requests produce different responses. elif temperature == 1.0: - print(f"Comparing '{response1_text}' != '{response2_text}'") assert response1_text != response2_text # Don't bother checking values other than the extremes else: @@ -265,15 +231,19 @@ def test_chat_completions_temperature_vllm(self, client, temperature): @pytest.mark.xfail( reason="TRT-LLM BLS model will ignore temperature until a later release" ) - @pytest.mark.skipif( - TEST_BACKEND != "tensorrtllm", reason="Only used to test TRT-LLM backend" - ) # Simple tests to verify temperature roughly behaves as expected - def test_chat_completions_temperature_tensorrtllm(self, client): + def test_chat_completions_temperature_tensorrtllm( + self, client, backend: str, model: str, messages: List[dict] + ): + if backend != "tensorrtllm": + pytest.skip( + reason="Only used to test TRT-LLM-specific temperature behavior" + ) + responses = [] payload1 = { - "model": TEST_MODEL, - "messages": TEST_MESSAGES, + "model": model, + "messages": messages, # Increase token length to allow more room for variability "max_tokens": 200, "temperature": 0.0, @@ -324,11 +294,11 @@ def test_chat_completions_temperature_tensorrtllm(self, client): assert response1_text != response3_text # Simple tests to verify random seed roughly behaves as expected - def test_chat_completions_seed(self, client): + def test_chat_completions_seed(self, client, model: str, messages: List[dict]): responses = [] payload1 = { - "model": TEST_MODEL, - "messages": TEST_MESSAGES, + "model": model, + "messages": messages, # Increase token length to allow more room for variability "max_tokens": 200, "seed": 1, @@ -374,11 +344,13 @@ def test_chat_completions_seed(self, client): assert response1_text == response2_text assert response1_text != response3_text - def test_chat_completions_no_message(self, client): + def test_chat_completions_no_message( + self, client, model: str, messages: List[dict] + ): # Message validation requires min_length of 1 messages = [] response = client.post( - "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + "/v1/chat/completions", json={"model": model, "messages": messages} ) assert response.status_code == 422 assert ( @@ -386,19 +358,23 @@ def test_chat_completions_no_message(self, client): == "List should have at least 
1 item after validation, not 0" ) - def test_chat_completions_empty_message(self, client): + def test_chat_completions_empty_message( + self, client, model: str, messages: List[dict] + ): # Message validation requires min_length of 1 messages = [{}] response = client.post( - "/v1/chat/completions", json={"model": TEST_MODEL, "messages": messages} + "/v1/chat/completions", json={"model": model, "messages": messages} ) assert response.status_code == 422 assert response.json()["detail"][0]["msg"] == "Field required" - def test_chat_completions_multiple_choices(self, client): + def test_chat_completions_multiple_choices( + self, client, model: str, messages: List[dict] + ): response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": TEST_MESSAGES, "n": 2}, + json={"model": model, "messages": messages, "n": 2}, ) assert response.status_code == 400 @@ -408,10 +384,12 @@ def test_chat_completions_multiple_choices(self, client): def test_chat_completions_streaming(self, client): pass - def test_chat_completions_no_streaming(self, client): + def test_chat_completions_no_streaming( + self, client, model: str, messages: List[dict] + ): response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": TEST_MESSAGES, "stream": False}, + json={"model": model, "messages": messages, "stream": False}, ) assert response.status_code == 200 @@ -452,22 +430,18 @@ def test_usage_response(self): # For tests that won't use the same pytest fixture for server startup across # the whole class test suite. class TestChatCompletionsCustomFixture: - def setup_app(self, tokenizer: str, model_repository: str): - os.environ["TOKENIZER"] = tokenizer - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - app = init_app() - return app - # A TOKENIZER must be known for /chat/completions endpoint in order to # apply chat templates, and for simplicity in determination, users should # define the TOKENIZER. So, explicitly raise an error if none is provided. 
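The dependency called out in the comment above can be illustrated with a short standalone sketch. It assumes the `transformers` library and a tokenizer ID that ships a chat template (for example the `meta-llama/Meta-Llama-3.1-8B-Instruct` ID used elsewhere in these tests), and it approximates the `apply_chat_template` step performed in `src/routers/chat_completions.py`:

```python
# Standalone approximation of the chat-template step; requires access to the
# tokenizer files (e.g. a valid HF_TOKEN for gated models).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

# /chat/completions flattens the request messages into a single prompt string
# using the tokenizer's chat template; without a tokenizer this step fails.
prompt = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "What is machine learning?"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```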
- def test_chat_completions_no_tokenizer(self): - model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app = self.setup_app(tokenizer="", model_repository=model_repository) + def test_chat_completions_no_tokenizer( + self, backend: str, model: str, messages: List[dict] + ): + model_repository = str(Path(__file__).parent / f"{backend}_models") + app = setup_fastapi_app(model_repository=model_repository, tokenizer="") with TestClient(app) as client: response = client.post( "/v1/chat/completions", - json={"model": TEST_MODEL, "messages": TEST_MESSAGES}, + json={"model": model, "messages": messages}, ) assert response.status_code == 400 assert response.json()["detail"] == "Unknown tokenizer" diff --git a/qa/L0_openai/openai/src/tests/test_completions.py b/qa/L0_openai/openai/src/tests/test_completions.py index 2b2c78c548..e43e225988 100644 --- a/qa/L0_openai/openai/src/tests/test_completions.py +++ b/qa/L0_openai/openai/src/tests/test_completions.py @@ -1,51 +1,17 @@ import copy -import os -from pathlib import Path import pytest -from fastapi.testclient import TestClient -from src.api_server import init_app - -### TEST ENVIRONMENT SETUP ### -TEST_BACKEND = "" -TEST_MODEL = "" -TEST_PROMPT = "Machine learning is" -try: - import vllm as _ - - TEST_BACKEND = "vllm" - TEST_MODEL = "llama-3-8b-instruct" -except ImportError: - pass - -try: - import tensorrt_llm as _ - - TEST_BACKEND = "tensorrtllm" - TEST_MODEL = "tensorrt_llm_bls" -except ImportError: - pass - -if not TEST_BACKEND or not TEST_MODEL: - raise Exception("Unknown test environment") -### class TestCompletions: - # TODO: Consider module/package scope, or join ChatCompletions tests into same file - # to run server only once for both sets of tests for faster iteration. @pytest.fixture(scope="class") - def client(self): - model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - app = init_app() - with TestClient(app) as test_client: - yield test_client - - def test_completions_defaults(self, client): + def client(self, fastapi_client_class_scope): + yield fastapi_client_class_scope + + def test_completions_defaults(self, client, model: str, prompt: str): response = client.post( "/v1/completions", - json={"model": TEST_MODEL, "prompt": TEST_PROMPT}, + json={"model": model, "prompt": prompt}, ) print("Response:", response.json()) @@ -69,12 +35,14 @@ def test_completions_defaults(self, client): ("logit_bias", {"0": 0}), ], ) - def test_completions_sampling_parameters(self, client, sampling_parameter, value): + def test_completions_sampling_parameters( + self, client, sampling_parameter, value, model: str, prompt: str + ): response = client.post( "/v1/completions", json={ - "model": TEST_MODEL, - "prompt": TEST_PROMPT, + "model": model, + "prompt": prompt, sampling_parameter: value, }, ) @@ -91,9 +59,9 @@ def test_completions_sampling_parameters(self, client, sampling_parameter, value assert response.json()["choices"][0]["text"].strip() # Simple tests to verify max_tokens roughly behaves as expected - def test_completions_max_tokens(self, client): + def test_completions_max_tokens(self, client, model: str, prompt: str): responses = [] - payload = {"model": TEST_MODEL, "prompt": TEST_PROMPT, "max_tokens": 1} + payload = {"model": model, "prompt": prompt, "max_tokens": 1} # Send two requests with max_tokens = 1 to check their similarity payload["max_tokens"] = 1 @@ -129,17 +97,21 @@ def test_completions_max_tokens(self, client): assert 
len(response1_text) == len(response2_text) == 1 assert len(response3_text) > len(response1_text) - @pytest.mark.skipif(TEST_BACKEND != "vllm", reason="Only used to test vLLM backend") @pytest.mark.parametrize( "temperature", [0.0, 1.0], ) # Simple tests to verify temperature roughly behaves as expected - def test_completions_temperature_vllm(self, client, temperature): + def test_completions_temperature_vllm( + self, client, temperature, backend: str, model: str, prompt: str + ): + if backend != "vllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + responses = [] payload = { - "model": TEST_MODEL, - "prompt": TEST_PROMPT, + "model": model, + "prompt": prompt, "temperature": temperature, } @@ -184,15 +156,17 @@ def test_completions_temperature_vllm(self, client, temperature): @pytest.mark.xfail( reason="TRT-LLM BLS model will ignore temperature until a later release" ) - @pytest.mark.skipif( - TEST_BACKEND != "tensorrtllm", reason="Only used to test TRT-LLM backend" - ) # Simple tests to verify temperature roughly behaves as expected - def test_completions_temperature_tensorrtllm(self, client): + def test_completions_temperature_tensorrtllm( + self, client, backend: str, model: str, prompt: str + ): + if backend != "tensorrtllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + responses = [] payload1 = { - "model": TEST_MODEL, - "prompt": TEST_PROMPT, + "model": model, + "prompt": prompt, "temperature": 0.0, # TRT-LLM requires certain settings of `top_k` / `top_p` to # respect changes in `temperature` @@ -234,9 +208,9 @@ def test_completions_temperature_tensorrtllm(self, client): assert response1_text != response3_text # Simple tests to verify seed roughly behaves as expected - def test_completions_seed(self, client): + def test_completions_seed(self, client, model: str, prompt: str): responses = [] - payload1 = {"model": TEST_MODEL, "prompt": TEST_PROMPT, "seed": 1} + payload1 = {"model": model, "prompt": prompt, "seed": 1} payload2 = copy.deepcopy(payload1) payload2["seed"] = 2 @@ -286,13 +260,13 @@ def test_completions_seed(self, client): ], ) def test_completions_invalid_sampling_parameters( - self, client, sampling_parameter, value + self, client, sampling_parameter, value, model: str, prompt: str ): response = client.post( "/v1/completions", json={ - "model": TEST_MODEL, - "prompt": TEST_PROMPT, + "model": model, + "prompt": prompt, sampling_parameter: value, }, ) @@ -304,32 +278,30 @@ def test_completions_empty_request(self, client): response = client.post("/v1/completions", json={}) assert response.status_code == 422 - def test_completions_no_model(self, client): - response = client.post("/v1/completions", json={"prompt": TEST_PROMPT}) + def test_completions_no_model(self, client, prompt: str): + response = client.post("/v1/completions", json={"prompt": prompt}) assert response.status_code == 422 - def test_completions_no_prompt(self, client): - response = client.post("/v1/completions", json={"model": TEST_MODEL}) + def test_completions_no_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model}) assert response.status_code == 422 - def test_completions_empty_prompt(self, client): - response = client.post( - "/v1/completions", json={"model": TEST_MODEL, "prompt": ""} - ) + def test_completions_empty_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model, "prompt": ""}) # NOTE: Should this be validated in schema instead? 
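One way the NOTE above could be resolved is to move the empty-prompt check into the Pydantic schema, so that an empty string is rejected during request validation (HTTP 422 from FastAPI) rather than by the route handler (HTTP 400). The sketch below is hypothetical and is not the project's actual `CreateCompletionRequest` schema in `src/schemas/openai.py`:

```python
# Hypothetical schema-level constraint, for illustration only.
from pydantic import BaseModel, Field


class CreateCompletionRequestSketch(BaseModel):
    model: str
    # min_length=1 rejects "" during request validation, so FastAPI would
    # return 422 instead of the 400 currently raised in the route handler.
    prompt: str = Field(min_length=1)
```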
# 400 Error returned in route handler assert response.status_code == 400 - def test_no_prompt(self, client): - response = client.post("/v1/completions", json={"model": TEST_MODEL}) + def test_no_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model}) # 422 Error returned by schema validation assert response.status_code == 422 - def test_completions_multiple_choices(self, client): + def test_completions_multiple_choices(self, client, model: str, prompt: str): response = client.post( - "/v1/completions", json={"model": TEST_MODEL, "prompt": TEST_PROMPT, "n": 2} + "/v1/completions", json={"model": model, "prompt": prompt, "n": 2} ) assert response.status_code == 400 diff --git a/qa/L0_openai/openai/src/tests/test_observability.py b/qa/L0_openai/openai/src/tests/test_observability.py index 67e1f42255..eca88a03de 100644 --- a/qa/L0_openai/openai/src/tests/test_observability.py +++ b/qa/L0_openai/openai/src/tests/test_observability.py @@ -5,7 +5,11 @@ from fastapi.testclient import TestClient from src.api_server import init_app -TEST_MODEL = "mock_llm" + +# Override conftest.py default model +@pytest.fixture +def model(): + return "mock_llm" class TestObservability: @@ -41,27 +45,27 @@ def test_startup_metrics(self, client): response = client.get("/metrics") assert response.status_code == 200 # FIXME: Flesh out more - # NOTE: response.json() works even on non-json prometheus data? + # NOTE: response.json() works even on non-json prometheus data assert "nv_cpu_utilization" in response.json() ### Models ### - def test_models_list(self, client): + def test_models_list(self, client, model): # TODO: Load multiple models and make sure exactly ALL are returned response = client.get("/v1/models") assert response.status_code == 200 - # TODO: Flesh out models = response.json()["data"] assert len(models) == 1 - assert models[0]["id"] == TEST_MODEL + assert models[0]["id"] == model assert models[0]["object"] == "model" assert models[0]["created"] > 0 + assert models[0]["owned_by"] == "Triton Inference Server" - def test_models_get(self, client): + def test_models_get(self, client, model): # TODO: Load multiple models and make sure exactly 1 is returned - response = client.get(f"/v1/models/{TEST_MODEL}") + response = client.get(f"/v1/models/{model}") assert response.status_code == 200 - # TODO: Flesh out - model = response.json() - assert model["id"] == TEST_MODEL - assert model["object"] == "model" - assert model["created"] > 0 + model_resp = response.json() + assert model_resp["id"] == model + assert model_resp["object"] == "model" + assert model_resp["created"] > 0 + assert model_resp["owned_by"] == "Triton Inference Server" diff --git a/qa/L0_openai/openai/src/tests/test_openai_client.py b/qa/L0_openai/openai/src/tests/test_openai_client.py index c6413f5614..6c61403e73 100644 --- a/qa/L0_openai/openai/src/tests/test_openai_client.py +++ b/qa/L0_openai/openai/src/tests/test_openai_client.py @@ -1,47 +1,7 @@ -from pathlib import Path +from typing import List import openai import pytest -from src.tests.utils import OpenAIServer - -### TEST ENVIRONMENT SETUP ### -TEST_BACKEND = "" -TEST_MODEL = "" -TEST_PROMPT = "What is machine learning?" 
-TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] -TEST_TOKENIZER = "meta-llama/Meta-Llama-3-8B-Instruct" -try: - import vllm as _ - - TEST_BACKEND = "vllm" - TEST_MODEL = "llama-3-8b-instruct" -except ImportError: - pass - -try: - import tensorrt_llm as _ - - TEST_BACKEND = "tensorrtllm" - TEST_MODEL = "tensorrt_llm_bls" -except ImportError: - pass - -if not TEST_BACKEND or not TEST_MODEL: - raise Exception("Unknown test environment") -### - - -# NOTE: OpenAI client requires actual server running, and won't work -# with the FastAPI TestClient. Run the server at module scope to run -# only once for all the tests below. -@pytest.fixture(scope="module") -def server(): - model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" - tokenizer = "meta-llama/Meta-Llama-3-8B-Instruct" - args = ["--model-repository", model_repository, "--tokenizer", tokenizer] - - with OpenAIServer(args) as openai_server: - yield openai_server class TestOpenAIClient: @@ -49,56 +9,52 @@ class TestOpenAIClient: def client(self, server): return server.get_client() - def test_openai_client_models(self, client: openai.OpenAI): + def test_openai_client_models(self, client: openai.OpenAI, backend: str): models = list(client.models.list()) print(f"Models: {models}") - if TEST_BACKEND == "tensorrtllm": + if backend == "tensorrtllm": # ensemble or tensorrt_llm_bls # preprocess -> tensorrt_llm -> postprocess assert len(models) == 5 - elif TEST_BACKEND == "vllm": + elif backend == "vllm": assert len(models) == 1 else: - raise Exception(f"Unexpected backend {TEST_BACKEND=}") + raise Exception(f"Unexpected backend {backend=}") - def test_openai_client_completion(self, client: openai.OpenAI): + def test_openai_client_completion( + self, client: openai.OpenAI, model: str, prompt: str + ): completion = client.completions.create( - prompt=TEST_PROMPT, - model=TEST_MODEL, + prompt=prompt, + model=model, ) print(f"Completion results: {completion}") assert completion.choices[0].text assert completion.choices[0].finish_reason == "stop" - def test_openai_client_chat_completion(self, client: openai.OpenAI): + def test_openai_client_chat_completion( + self, client: openai.OpenAI, model: str, messages: List[dict] + ): chat_completion = client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Who won the world series in 2020?"}, - { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020.", - }, - {"role": "user", "content": "Where was it played?"}, - ], - model=TEST_MODEL, + messages=messages, + model=model, ) print(f"Chat completion results: {chat_completion}") assert chat_completion.choices[0].message.content assert chat_completion.choices[0].finish_reason == "stop" - @pytest.mark.skipif( - TEST_BACKEND == "tensorrtllm", - reason="TRT-LLM backend currently only supports setting this parameter at model load time", - ) @pytest.mark.parametrize("echo", [False, True]) - def test_openai_client_completion_echo(self, client: openai.OpenAI, echo: bool): - prompt = "What is the capital of France?" 
- completion = client.completions.create( - prompt=prompt, model=TEST_MODEL, echo=echo - ) + def test_openai_client_completion_echo( + self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str + ): + if backend == "tensorrtllm": + pytest.skip( + reason="TRT-LLM backend currently only supports setting this parameter at model load time", + ) + + completion = client.completions.create(prompt=prompt, model=model, echo=echo) print(f"Completion results: {completion}") response = completion.choices[0].text @@ -118,24 +74,26 @@ def client(self, server): return server.get_async_client() @pytest.mark.asyncio - async def test_openai_client_models(self, client: openai.AsyncOpenAI): + async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str): async_models = await client.models.list() models = [model async for model in async_models] print(f"Models: {models}") - if TEST_BACKEND == "tensorrtllm": + if backend == "tensorrtllm": # ensemble or tensorrt_llm_bls # preprocess -> tensorrt_llm -> postprocess assert len(models) == 5 - elif TEST_BACKEND == "vllm": + elif backend == "vllm": assert len(models) == 1 else: - raise Exception(f"Unexpected backend {TEST_BACKEND=}") + raise Exception(f"Unexpected backend {backend=}") @pytest.mark.asyncio - async def test_openai_client_completion(self, client: openai.AsyncOpenAI): + async def test_openai_client_completion( + self, client: openai.AsyncOpenAI, model: str, prompt: str + ): completion = await client.completions.create( - prompt=TEST_PROMPT, - model=TEST_MODEL, + prompt=prompt, + model=model, ) print(f"Completion results: {completion}") @@ -143,18 +101,12 @@ async def test_openai_client_completion(self, client: openai.AsyncOpenAI): assert completion.choices[0].finish_reason == "stop" @pytest.mark.asyncio - async def test_openai_client_chat_completion(self, client: openai.AsyncOpenAI): + async def test_openai_client_chat_completion( + self, client: openai.AsyncOpenAI, model: str, messages: List[dict] + ): chat_completion = await client.chat.completions.create( - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Who won the world series in 2020?"}, - { - "role": "assistant", - "content": "The Los Angeles Dodgers won the World Series in 2020.", - }, - {"role": "user", "content": "Where was it played?"}, - ], - model=TEST_MODEL, + messages=messages, + model=model, ) assert chat_completion.choices[0].message.content @@ -164,19 +116,16 @@ async def test_openai_client_chat_completion(self, client: openai.AsyncOpenAI): # TODO: Add this test @pytest.mark.skip(reason="Not Implemented Yet") @pytest.mark.asyncio - async def test_completion_streaming(self, client: openai.AsyncOpenAI): + async def test_completion_streaming(self): pass @pytest.mark.asyncio - async def test_chat_streaming(self, client: openai.AsyncOpenAI): - messages = [ - {"role": "system", "content": "you are a helpful assistant"}, - {"role": "user", "content": "what is 1+1?"}, - ] - + async def test_chat_streaming( + self, client: openai.AsyncOpenAI, model: str, messages: List[dict] + ): # test single completion chat_completion = await client.chat.completions.create( - model=TEST_MODEL, + model=model, messages=messages, max_tokens=10, temperature=0.0, @@ -187,7 +136,7 @@ async def test_chat_streaming(self, client: openai.AsyncOpenAI): # test streaming stream = await client.chat.completions.create( - model=TEST_MODEL, + model=model, messages=messages, max_tokens=10, temperature=0.0, @@ -197,7 +146,6 @@ 
async def test_chat_streaming(self, client: openai.AsyncOpenAI): finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta - print("[DEBUG] DELTA:", delta) if delta.role: assert delta.role == "assistant" if delta.content: diff --git a/qa/L0_openai/openai/src/tests/utils.py b/qa/L0_openai/openai/src/tests/utils.py index b9ef2dcb5f..d03368663a 100644 --- a/qa/L0_openai/openai/src/tests/utils.py +++ b/qa/L0_openai/openai/src/tests/utils.py @@ -7,6 +7,14 @@ import openai import requests +from src.api_server import init_app + + +def setup_fastapi_app(tokenizer: str, model_repository: str): + os.environ["TOKENIZER"] = tokenizer + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + app = init_app() + return app # Heavily inspired by vLLM's test infrastructure @@ -44,7 +52,8 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.proc.terminate() try: - self.proc.wait(3) + wait_secs = 30 + self.proc.wait(wait_secs) except subprocess.TimeoutExpired: # force kill if needed self.proc.kill() diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json b/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json deleted file mode 100644 index e60275ce16..0000000000 --- a/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/1/model.json +++ /dev/null @@ -1 +0,0 @@ -{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt b/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt deleted file mode 100644 index a3edd238de..0000000000 --- a/qa/L0_openai/openai/src/tests/vllm_models/llama-3-8b-instruct/config.pbtxt +++ /dev/null @@ -1,2 +0,0 @@ -backend: "vllm" -instance_group { kind: KIND_MODEL } diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json new file mode 100644 index 0000000000..00f18b88bd --- /dev/null +++ b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json @@ -0,0 +1 @@ +{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt new file mode 100644 index 0000000000..4ad6534943 --- /dev/null +++ b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt @@ -0,0 +1,2 @@ +backend: "vllm" +instance_group [{kind: KIND_MODEL}] \ No newline at end of file From 6cf2e772db51a877e146ddbf9d712c5341dfd41c Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 16 Aug 2024 12:31:03 -0700 Subject: [PATCH 20/80] Split Dockefile in 2 to capture llama3.1 requirement for vllm --- qa/L0_openai/openai/Dockerfile.trtllm | 4 ++++ qa/L0_openai/openai/Dockerfile.vllm | 6 ++++++ 2 files changed, 10 insertions(+) create mode 100644 qa/L0_openai/openai/Dockerfile.trtllm create mode 100644 qa/L0_openai/openai/Dockerfile.vllm diff --git a/qa/L0_openai/openai/Dockerfile.trtllm b/qa/L0_openai/openai/Dockerfile.trtllm new file mode 100644 index 0000000000..1128cc4355 --- /dev/null +++ b/qa/L0_openai/openai/Dockerfile.trtllm @@ -0,0 +1,4 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 +FROM ${BASE_IMAGE} 
+RUN pip install /opt/tritonserver/python/*.whl +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" diff --git a/qa/L0_openai/openai/Dockerfile.vllm b/qa/L0_openai/openai/Dockerfile.vllm new file mode 100644 index 0000000000..dbb8a5f63d --- /dev/null +++ b/qa/L0_openai/openai/Dockerfile.vllm @@ -0,0 +1,6 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 +FROM ${BASE_IMAGE} +RUN pip install /opt/tritonserver/python/*.whl +# NOTE: Newer vllm version upgrade to support Llama3.1 in 24.07 container. +# This should be unnecessary in 24.08 container. +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" From 66afc48c4bca1a4016fb9ab6dd240506aebbcf22 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 16 Aug 2024 12:31:33 -0700 Subject: [PATCH 21/80] Split Dockerfile in 2 to capture llama3.1 requirement for vllm --- qa/L0_openai/openai/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 qa/L0_openai/openai/Dockerfile diff --git a/qa/L0_openai/openai/Dockerfile b/qa/L0_openai/openai/Dockerfile deleted file mode 100644 index c053959a75..0000000000 --- a/qa/L0_openai/openai/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 -FROM ${BASE_IMAGE} -RUN pip install /opt/tritonserver/python/*.whl -# TODO: Add to requirements.txt -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" From 0bbd248babcd3ed5b7e0f95072523d5ac26faf97 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 16 Aug 2024 12:31:55 -0700 Subject: [PATCH 22/80] Add configurable model parameter to examples --- qa/L0_openai/examples/chat.sh | 3 ++- qa/L0_openai/examples/openai_client.py | 8 +++++++- .../examples/{streaming_curl.sh => streaming_chat.sh} | 0 3 files changed, 9 insertions(+), 2 deletions(-) rename qa/L0_openai/examples/{streaming_curl.sh => streaming_chat.sh} (100%) diff --git a/qa/L0_openai/examples/chat.sh b/qa/L0_openai/examples/chat.sh index a8f3d6d8ef..94985e74de 100755 --- a/qa/L0_openai/examples/chat.sh +++ b/qa/L0_openai/examples/chat.sh @@ -1,7 +1,8 @@ #!/bin/bash +MODEL=${1:-"tensorrt_llm_bls"} curl -s http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "tensorrt_llm_bls", + "model": "'${MODEL}'", "messages": [{"role": "user", "content": "Say this is a test!"}] }' | jq diff --git a/qa/L0_openai/examples/openai_client.py b/qa/L0_openai/examples/openai_client.py index 0b7184af22..d4fcad983b 100755 --- a/qa/L0_openai/examples/openai_client.py +++ b/qa/L0_openai/examples/openai_client.py @@ -1,13 +1,19 @@ #!/usr/bin/env python3 +import sys + from openai import OpenAI +model = "tensorrt_llm_bls" +if len(sys.argv) > 1: + model = sys.argv[1] + client = OpenAI( base_url="http://localhost:8000/v1", api_key="EMPTY", ) completion = client.chat.completions.create( - model="tensorrt_llm_bls", + model=model, messages=[ { "role": "system", diff --git a/qa/L0_openai/examples/streaming_curl.sh b/qa/L0_openai/examples/streaming_chat.sh similarity index 100% rename from qa/L0_openai/examples/streaming_curl.sh rename to qa/L0_openai/examples/streaming_chat.sh From 6e59f6e217855f59e307d3d27589f4f0e00a1ea7 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 15:31:15 -0700 Subject: [PATCH 23/80] Fix streaming for genai-perf by setting the content-type to text/event-stream --- .gitignore | 1 + qa/L0_openai/examples/genai_perf.sh | 6 +++--- 
qa/L0_openai/openai/src/routers/chat_completions.py | 3 ++- qa/L0_openai/openai/src/routers/completions.py | 5 ++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index f1b69cb25e..d553dfde16 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__ tmp *.log test_results.txt +artifacts diff --git a/qa/L0_openai/examples/genai_perf.sh b/qa/L0_openai/examples/genai_perf.sh index 1e3f44edcf..cc16813796 100755 --- a/qa/L0_openai/examples/genai_perf.sh +++ b/qa/L0_openai/examples/genai_perf.sh @@ -1,6 +1,7 @@ #!/bin/bash +MODEL=${1:-"tensorrt_llm_bls"} genai-perf \ - --model tensorrt_llm_bls \ + --model ${MODEL} \ --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ --service-kind openai \ --endpoint-type chat \ @@ -8,5 +9,4 @@ genai-perf \ --synthetic-input-tokens-stddev 0 \ --output-tokens-mean 256 \ --output-tokens-stddev 0 \ - # --streaming - # --extra-inputs stream:true + --streaming diff --git a/qa/L0_openai/openai/src/routers/chat_completions.py b/qa/L0_openai/openai/src/routers/chat_completions.py index 762cb14df8..a970574e71 100644 --- a/qa/L0_openai/openai/src/routers/chat_completions.py +++ b/qa/L0_openai/openai/src/routers/chat_completions.py @@ -149,7 +149,8 @@ def create_chat_completion( return StreamingResponse( streaming_chat_completion_response( request_id, created, request.model, role, responses - ) + ), + media_type="text/event-stream", ) # Response validation with decoupled models in mind diff --git a/qa/L0_openai/openai/src/routers/completions.py b/qa/L0_openai/openai/src/routers/completions.py index 92dee5807d..5d1e9b12fa 100644 --- a/qa/L0_openai/openai/src/routers/completions.py +++ b/qa/L0_openai/openai/src/routers/completions.py @@ -97,7 +97,10 @@ def create_completion( ) if request.stream: return StreamingResponse( - streaming_completion_response(request_id, created, metadata.name, responses) + streaming_completion_response( + request_id, created, metadata.name, responses + ), + media_type="text/event-stream", ) # Response validation with decoupled models in mind From 763b3a4f85a6e8f82447fa213f77d94685af067d Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 15:33:33 -0700 Subject: [PATCH 24/80] Update examples to default to vllm model for simplicity --- qa/L0_openai/examples/chat.sh | 3 ++- qa/L0_openai/examples/genai_perf.sh | 2 +- qa/L0_openai/examples/openai_client.py | 3 ++- qa/L0_openai/examples/streaming_chat.sh | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/qa/L0_openai/examples/chat.sh b/qa/L0_openai/examples/chat.sh index 94985e74de..5a7bb9b656 100755 --- a/qa/L0_openai/examples/chat.sh +++ b/qa/L0_openai/examples/chat.sh @@ -1,5 +1,6 @@ #!/bin/bash -MODEL=${1:-"tensorrt_llm_bls"} +# or "tensorrt_llm_bls" for TRT-LLM +MODEL=${1:-"llama-3.1-8b-instruct"} curl -s http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/qa/L0_openai/examples/genai_perf.sh b/qa/L0_openai/examples/genai_perf.sh index cc16813796..b1e3716fb6 100755 --- a/qa/L0_openai/examples/genai_perf.sh +++ b/qa/L0_openai/examples/genai_perf.sh @@ -1,5 +1,5 @@ #!/bin/bash -MODEL=${1:-"tensorrt_llm_bls"} +MODEL=${1:-"llama-3.1-8b-instruct"} genai-perf \ --model ${MODEL} \ --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ diff --git a/qa/L0_openai/examples/openai_client.py b/qa/L0_openai/examples/openai_client.py index d4fcad983b..913aac44a0 100755 --- a/qa/L0_openai/examples/openai_client.py +++ b/qa/L0_openai/examples/openai_client.py @@ -3,7 +3,8 @@ from openai 
import OpenAI -model = "tensorrt_llm_bls" +# or "tensorrt_llm_bls" for TRT-LLM +model = "llama-3.1-8b-instruct" if len(sys.argv) > 1: model = sys.argv[1] diff --git a/qa/L0_openai/examples/streaming_chat.sh b/qa/L0_openai/examples/streaming_chat.sh index 33bebe253e..6ace5434d2 100755 --- a/qa/L0_openai/examples/streaming_chat.sh +++ b/qa/L0_openai/examples/streaming_chat.sh @@ -1,5 +1,6 @@ #!/bin/bash -MODEL=${1:-"tensorrt_llm_bls"} +# or "tensorrt_llm_bls" for TRT-LLM +MODEL=${1:-"llama-3.1-8b-instruct"} curl -s -N http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ From 0328ea6b94b32e54442c431eb8dcc50a8ccedce5 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 15:34:22 -0700 Subject: [PATCH 25/80] Start high level README for other developers --- qa/L0_openai/openai/README.md | 133 +++++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 10 deletions(-) diff --git a/qa/L0_openai/openai/README.md b/qa/L0_openai/openai/README.md index 948889c916..8bef9fd1d4 100644 --- a/qa/L0_openai/openai/README.md +++ b/qa/L0_openai/openai/README.md @@ -1,15 +1,128 @@ -Goal: +# OpenAI-Compatible Frontend for Triton Inference Server +## Pre-requisites + +1. Docker + NVIDIA Container Runtime +2. A correctly configured `HF_TOKEN` for access to HuggingFace models. + - The current examples and testing primarily use the + [`meta-llama/Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + model, but you can manually bring your own models and adjust accordingly. + +## VLLM + +1. Build and launch the container: +```bash +docker build -t tritonserver-openai-vllm -f Dockerfile.vllm . +# NOTE: The volume mount is flexible as long as you can access +# all the source files within the container. +docker run -it --net=host --gpus all --rm \ + -v ${PWD}:/workspace \ + -w /workspace \ + tritonserver-openai-vllm +``` + +2. Launch the OpenAI server: +```bash +# NOTE: Adjust the --tokenizer based on the model being used +python3 main.py --model-repository src/tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +``` + +3. Send a `/chat/completions` request: +```bash +MODEL="llama-3.1-8b-instruct" +curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "messages": [{"role": "user", "content": "Say this is a test!"}] +}' ``` -docker build -t tritonserver-openai:latest . + +4. Send a `/completions` request: +```bash +MODEL="llama-3.1-8b-instruct" +curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "prompt": "Machine learning is" +}' +``` + +5. Benchmark with `genai-perf`: +```bash +MODEL="llama-3.1-8b-instruct" +TOKENIZER="meta-llama/Meta-Llama-3-8B-Instruct" +genai-perf \ + --model ${MODEL} \ + --tokenizer ${TOKENIZER} \ + --service-kind openai \ + --endpoint-type chat \ + --synthetic-input-tokens-mean 256 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 256 \ + --output-tokens-stddev 0 \ + --streaming +``` + +6. Use an OpenAI client: +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", +) + +model = "llama-3.1-8b-instruct" +completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + {"role": "user", "content": "What are LLMs?"}, + ], + max_tokens=256, +) + +print(completion.choices[0].message.content) +``` + +7. 
Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): +``` +cd src/tests +pytest -v +``` + +8. For other examples, see the `examples/` folder. + +## TensorRT-LLM + +0. `[TODO]` Prepare your model repository for a TensorRT-LLM model, build the engine, etc. + +1. Build and launch the container: +``` +docker build -t tritonserver-openai-tensorrtllm -f Dockerfile.tensorrtllm . +# NOTE: The volume mount is flexible as long as you can access +# all the source files within the container. docker run -it --net=host --gpus all --rm \ - tritonserver-openai:latest \ - --model gpt2 + -v ${PWD}:/workspace \ + -w /workspace \ + tritonserver-openai-tensorrtllm +``` + +2. Launch the OpenAI server: +``` +# NOTE: Adjust the --tokenizer based on the model being used +python3 main.py --model-repository src/tests/tensorrt_llm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +``` + +3. Send requests: +``` +MODEL="tensorrt_llm_bls" +curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "messages": [{"role": "user", "content": "Say this is a test!"}] +}' ``` -Testing: -- Verify known issues are fixed or not - - concurrency, parameter corruption, etc. - - check out Tanmay's fix for using numpy arrays instead of native types - - exclude_input_in_output overwritten at high concurrency? - - ? +The other examples should be the same as vLLM, except that you should set `MODEL="tensorrt_llm_bls"`, +everywhere applicable as seen in the example request above. From 43dd3293471508c8cae7720fde3c5e875da97388 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:03:25 -0700 Subject: [PATCH 26/80] Move openai source code into server/python/openai folder, and flesh out README accordingly --- python/openai/README.md | 141 +++ python/openai/docker/Dockerfile.tensorrtllm | 4 + python/openai/docker/Dockerfile.vllm | 6 + python/openai/examples/chat.sh | 9 + python/openai/examples/genai_perf.sh | 12 + python/openai/examples/models.sh | 3 + python/openai/examples/openai_client.py | 28 + python/openai/examples/streaming_chat.sh | 19 + python/openai/openai/main.py | 67 ++ python/openai/openai/src/__init__.py | 0 python/openai/openai/src/api_server.py | 74 ++ python/openai/openai/src/routers/__init__.py | 0 .../openai/src/routers/chat_completions.py | 178 ++++ .../openai/openai/src/routers/completions.py | 125 +++ python/openai/openai/src/routers/models.py | 59 ++ .../openai/src/routers/observability.py | 24 + python/openai/openai/src/schemas/__init__.py | 0 python/openai/openai/src/schemas/openai.py | 871 ++++++++++++++++++ python/openai/openai/src/utils/__init__.py | 0 python/openai/openai/src/utils/tokenizer.py | 77 ++ python/openai/openai/src/utils/triton.py | 219 +++++ python/openai/openai/tests/__init__.py | 0 python/openai/openai/tests/conftest.py | 75 ++ .../tests/tensorrtllm_models/ensemble/1/.tmp | 0 .../tensorrtllm_models/ensemble/config.pbtxt | 470 ++++++++++ .../postprocessing/1/model.py | 246 +++++ .../postprocessing/config.pbtxt | 113 +++ .../preprocessing/1/model.py | 418 +++++++++ .../preprocessing/config.pbtxt | 156 ++++ .../tensorrt_llm/1/.gitkeep | 0 .../tensorrt_llm/1/model.py | 797 ++++++++++++++++ .../tensorrt_llm/config.pbtxt | 542 +++++++++++ .../tensorrt_llm_bls/1/lib/decode.py | 347 +++++++ .../tensorrt_llm_bls/1/lib/triton_decoder.py | 478 ++++++++++ .../tensorrt_llm_bls/1/model.py | 137 +++ .../tensorrt_llm_bls/config.pbtxt | 252 +++++ 
.../openai/tests/test_chat_completions.py | 447 +++++++++ .../openai/openai/tests/test_completions.py | 321 +++++++ .../tests/test_models/mock_llm/1/model.py | 108 +++ .../tests/test_models/mock_llm/config.pbtxt | 60 ++ .../openai/openai/tests/test_observability.py | 71 ++ .../openai/openai/tests/test_openai_client.py | 163 ++++ python/openai/openai/tests/utils.py | 93 ++ .../llama-3.1-8b-instruct/1/model.json | 1 + .../llama-3.1-8b-instruct/config.pbtxt | 2 + 45 files changed, 7213 insertions(+) create mode 100644 python/openai/README.md create mode 100644 python/openai/docker/Dockerfile.tensorrtllm create mode 100644 python/openai/docker/Dockerfile.vllm create mode 100755 python/openai/examples/chat.sh create mode 100755 python/openai/examples/genai_perf.sh create mode 100755 python/openai/examples/models.sh create mode 100755 python/openai/examples/openai_client.py create mode 100755 python/openai/examples/streaming_chat.sh create mode 100755 python/openai/openai/main.py create mode 100644 python/openai/openai/src/__init__.py create mode 100644 python/openai/openai/src/api_server.py create mode 100644 python/openai/openai/src/routers/__init__.py create mode 100644 python/openai/openai/src/routers/chat_completions.py create mode 100644 python/openai/openai/src/routers/completions.py create mode 100644 python/openai/openai/src/routers/models.py create mode 100644 python/openai/openai/src/routers/observability.py create mode 100644 python/openai/openai/src/schemas/__init__.py create mode 100644 python/openai/openai/src/schemas/openai.py create mode 100644 python/openai/openai/src/utils/__init__.py create mode 100644 python/openai/openai/src/utils/tokenizer.py create mode 100644 python/openai/openai/src/utils/triton.py create mode 100644 python/openai/openai/tests/__init__.py create mode 100644 python/openai/openai/tests/conftest.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp create mode 100644 python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt create mode 100644 python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt create mode 100644 python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py create mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt create mode 100644 python/openai/openai/tests/test_chat_completions.py create mode 100644 python/openai/openai/tests/test_completions.py create mode 100644 python/openai/openai/tests/test_models/mock_llm/1/model.py create mode 100644 python/openai/openai/tests/test_models/mock_llm/config.pbtxt create mode 100644 python/openai/openai/tests/test_observability.py create mode 100644 python/openai/openai/tests/test_openai_client.py create mode 100644 python/openai/openai/tests/utils.py create 
mode 100644 python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json create mode 100644 python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt diff --git a/python/openai/README.md b/python/openai/README.md new file mode 100644 index 0000000000..c8baf0a8e8 --- /dev/null +++ b/python/openai/README.md @@ -0,0 +1,141 @@ +# OpenAI-Compatible Frontend for Triton Inference Server + +## Pre-requisites + +1. Docker + NVIDIA Container Runtime +2. A correctly configured `HF_TOKEN` for access to HuggingFace models. + - The current examples and testing primarily use the + [`meta-llama/Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + model, but you can manually bring your own models and adjust accordingly. + +## VLLM + +1. Build and launch the container: + - Mounts the openai source files to `/workspace` for simplicity, later on these will be shipped in the container. + - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. + - Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to + access gated models, make sure this is set in your local environment if needed. + +```bash +docker build -t tritonserver-openai-vllm -f docker/Dockerfile.vllm . +docker run -it --net=host --gpus all --rm \ + -v ${PWD}:/workspace \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + -e HF_TOKEN \ + -w /workspace \ + tritonserver-openai-vllm +``` + +2. Launch the OpenAI server: +```bash +# NOTE: Adjust the --tokenizer based on the model being used +python3 openai/main.py --model-repository openai/tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +``` + +3. Send a `/v1/chat/completions` request: + - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. +```bash +MODEL="llama-3.1-8b-instruct" +curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "messages": [{"role": "user", "content": "Say this is a test!"}] +}' | jq +``` + +4. Send a `/v1/completions` request: + - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. +```bash +MODEL="llama-3.1-8b-instruct" +curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "prompt": "Machine learning is" +}' | jq +``` + +5. Benchmark with `genai-perf`: +```bash +MODEL="llama-3.1-8b-instruct" +TOKENIZER="meta-llama/Meta-Llama-3-8B-Instruct" +genai-perf \ + --model ${MODEL} \ + --tokenizer ${TOKENIZER} \ + --service-kind openai \ + --endpoint-type chat \ + --synthetic-input-tokens-mean 256 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 256 \ + --output-tokens-stddev 0 \ + --streaming +``` + +6. Use the OpenAI python client directly: +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", +) + +model = "llama-3.1-8b-instruct" +completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + {"role": "user", "content": "What are LLMs?"}, + ], + max_tokens=256, +) + +print(completion.choices[0].message.content) +``` + +7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): +``` +cd openai/tests/ +pytest -v +``` + +8. 
For a list of examples, see the `examples/` folder. + +## TensorRT-LLM + +0. `[TODO]` Prepare your model repository for a TensorRT-LLM model, build the engine, etc. + +1. Build and launch the container: + - Mounts the openai source files to `/workspace` for simplicity, later on these will be shipped in the container. + - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. + - Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to + access gated models, make sure this is set in your local environment if needed. + +```bash +docker build -t tritonserver-openai-tensorrtllm -f docker/Dockerfile.tensorrtllm . +docker run -it --net=host --gpus all --rm \ + -v ${PWD}:/workspace \ + -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ + -e HF_TOKEN \ + -w /workspace \ + tritonserver-openai-tensorrtllm +``` + +2. Launch the OpenAI server: +```bash +# NOTE: Adjust the --tokenizer based on the model being used +python3 openai/main.py --model-repository openai/tests/tensorrtllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +``` + +3. Send a `/v1/chat/completions` request: + - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. +```bash +MODEL="tensorrt_llm_bls" +curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ + "model": "'${MODEL}'", + "messages": [{"role": "user", "content": "Say this is a test!"}] +}' | jq +``` + +The other examples should be the same as vLLM, except that you should set `MODEL="tensorrt_llm_bls"`, +everywhere applicable as seen in the example request above. diff --git a/python/openai/docker/Dockerfile.tensorrtllm b/python/openai/docker/Dockerfile.tensorrtllm new file mode 100644 index 0000000000..1128cc4355 --- /dev/null +++ b/python/openai/docker/Dockerfile.tensorrtllm @@ -0,0 +1,4 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 +FROM ${BASE_IMAGE} +RUN pip install /opt/tritonserver/python/*.whl +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm new file mode 100644 index 0000000000..dbb8a5f63d --- /dev/null +++ b/python/openai/docker/Dockerfile.vllm @@ -0,0 +1,6 @@ +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 +FROM ${BASE_IMAGE} +RUN pip install /opt/tritonserver/python/*.whl +# NOTE: Newer vllm version upgrade to support Llama3.1 in 24.07 container. +# This should be unnecessary in 24.08 container.
+RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" diff --git a/python/openai/examples/chat.sh b/python/openai/examples/chat.sh new file mode 100755 index 0000000000..5a7bb9b656 --- /dev/null +++ b/python/openai/examples/chat.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# or "tensorrt_llm_bls" for TRT-LLM +MODEL=${1:-"llama-3.1-8b-instruct"} +curl -s http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'${MODEL}'", + "messages": [{"role": "user", "content": "Say this is a test!"}] + }' | jq diff --git a/python/openai/examples/genai_perf.sh b/python/openai/examples/genai_perf.sh new file mode 100755 index 0000000000..b1e3716fb6 --- /dev/null +++ b/python/openai/examples/genai_perf.sh @@ -0,0 +1,12 @@ +#!/bin/bash +MODEL=${1:-"llama-3.1-8b-instruct"} +genai-perf \ + --model ${MODEL} \ + --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ + --service-kind openai \ + --endpoint-type chat \ + --synthetic-input-tokens-mean 256 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 256 \ + --output-tokens-stddev 0 \ + --streaming diff --git a/python/openai/examples/models.sh b/python/openai/examples/models.sh new file mode 100755 index 0000000000..944fbe07af --- /dev/null +++ b/python/openai/examples/models.sh @@ -0,0 +1,3 @@ +#!/bin/bash +curl -s http://localhost:8000/v1/models \ + -H "Content-Type: application/json" | jq diff --git a/python/openai/examples/openai_client.py b/python/openai/examples/openai_client.py new file mode 100755 index 0000000000..913aac44a0 --- /dev/null +++ b/python/openai/examples/openai_client.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import sys + +from openai import OpenAI + +# or "tensorrt_llm_bls" for TRT-LLM +model = "llama-3.1-8b-instruct" +if len(sys.argv) > 1: + model = sys.argv[1] + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="EMPTY", +) + +completion = client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": "You are a helpful assistant.", + }, + {"role": "user", "content": "What are LLMs?"}, + ], + max_tokens=256, +) + +print(completion.choices[0].message.content) diff --git a/python/openai/examples/streaming_chat.sh b/python/openai/examples/streaming_chat.sh new file mode 100755 index 0000000000..6ace5434d2 --- /dev/null +++ b/python/openai/examples/streaming_chat.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# or "tensorrt_llm_bls" for TRT-LLM +MODEL=${1:-"llama-3.1-8b-instruct"} +curl -s -N http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'${MODEL}'", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ], + "stream": true + }' diff --git a/python/openai/openai/main.py b/python/openai/openai/main.py new file mode 100755 index 0000000000..4f6f11a9f9 --- /dev/null +++ b/python/openai/openai/main.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +import argparse +import os + +import uvicorn +from src.api_server import init_app + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Triton OpenAI Compatible RESTful API server." 
+ ) + # Uvicorn + uvicorn_group = parser.add_argument_group("Uvicorn") + uvicorn_group.add_argument("--host", type=str, default=None, help="host name") + uvicorn_group.add_argument("--port", type=int, default=8000, help="port number") + uvicorn_group.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=["debug", "info", "warning", "error", "critical", "trace"], + help="log level for uvicorn", + ) + + # Triton + triton_group = parser.add_argument_group("Triton Inference Server") + triton_group.add_argument( + "--tritonserver-log-level", + type=int, + default=0, + help="The tritonserver log verbosity level", + ) + triton_group.add_argument( + "--model-repository", + type=str, + default=None, + help="Path to the Triton model repository holding the models to be served", + ) + triton_group.add_argument( + "--tokenizer", + type=str, + default=None, + help="HuggingFace ID of the Tokenizer to use for chat templates", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + # NOTE: Think about other ways to pass triton args to fastapi app, + # but use env vars for simplicity for now. + if args.model_repository: + os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository + if args.tokenizer: + os.environ["TOKENIZER"] = args.tokenizer + + os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) + + app = init_app() + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + timeout_keep_alive=5, + ) diff --git a/python/openai/openai/src/__init__.py b/python/openai/openai/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/src/api_server.py b/python/openai/openai/src/api_server.py new file mode 100644 index 0000000000..1b7543a4a0 --- /dev/null +++ b/python/openai/openai/src/api_server.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from contextlib import asynccontextmanager + +import tritonserver +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from src.routers import chat_completions, completions, models, observability +from src.utils.triton import init_tritonserver + + +def add_cors_middleware(app: FastAPI): + # Allow API calls through browser /docs route for debug purposes + origins = [ + "http://localhost", + ] + + print(f"[WARNING] Adding CORS for the following origins: {origins}") + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + print("Starting FastAPI app lifespan...") + # Start the tritonserver on FastAPI app startup + server, model_metadatas = init_tritonserver() + app.server = server + app.models = {metadata.name: metadata for metadata in model_metadatas} + + yield + + # Cleanup the tritonserver on FastAPI app shutdown + print("Shutting down FastAPI app lifespan...") + if app.server: + print("Shutting down Triton Inference Server...") + try: + app.server.stop() + # Log error, but don't raise on shutdown + except tritonserver.InternalError as e: + print(e) + + +def init_app(): + app = FastAPI( + title="OpenAI API", + description="The OpenAI REST API. 
Please see https://platform.openai.com/docs/api-reference for more details.", + version="2.0.0", + termsOfService="https://openai.com/policies/terms-of-use", + contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, + license={ + "name": "MIT", + "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, + lifespan=lifespan, + ) + + app.include_router(observability.router) + app.include_router(models.router) + app.include_router(completions.router) + app.include_router(chat_completions.router) + + # NOTE: For debugging purposes, should generally be restricted or removed + add_cors_middleware(app) + + # TODO: Add common logger and use logger.debug in place of current print + # statements for debugging purposes. + + return app diff --git a/python/openai/openai/src/routers/__init__.py b/python/openai/openai/src/routers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/src/routers/chat_completions.py b/python/openai/openai/src/routers/chat_completions.py new file mode 100644 index 0000000000..a970574e71 --- /dev/null +++ b/python/openai/openai/src/routers/chat_completions.py @@ -0,0 +1,178 @@ +import time +import uuid + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import StreamingResponse +from src.schemas.openai import ( + ChatCompletionChoice, + ChatCompletionFinishReason, + ChatCompletionResponseMessage, + ChatCompletionStreamingResponseChoice, + ChatCompletionStreamResponseDelta, + CreateChatCompletionRequest, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, + ObjectType, +) +from src.utils.triton import get_output, validate_triton_responses + +router = APIRouter() + + +def get_first_response_role(conversation, add_generation_prompt, default_role): + if add_generation_prompt: + return default_role + + return conversation[-1]["role"] + + +def streaming_chat_completion_response(request_id, created, model, role, responses): + # first chunk + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=role, content="", function_call=None + ), + logprobs=None, + finish_reason=None, + ) + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + for response in responses: + text = get_output(response) + + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=None, content=text, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop if response.final else None, + ) + + chunk = CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + yield "data: [DONE]\n\n" + + +@router.post( + "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] +) +def create_chat_completion( + request: CreateChatCompletionRequest, + raw_request: Request, +) -> CreateChatCompletionResponse | StreamingResponse: + """ + Creates a model response for the given chat conversation. 
+ """ + + # TODO: Cleanup + print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") + print(f"[DEBUG] Fetching model metadata for {request.model=}") + + model_metadatas = raw_request.app.models + if not model_metadatas: + raise HTTPException(status_code=400, detail="No known models") + + metadata = model_metadatas.get(request.model) + if not metadata: + raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") + + if not metadata.request_convert_fn: + raise HTTPException( + status_code=400, detail=f"Unknown request format for model: {request.model}" + ) + + if not metadata.tokenizer: + raise HTTPException(status_code=400, detail="Unknown tokenizer") + + if not metadata.backend: + raise HTTPException(status_code=400, detail="Unknown backend") + + triton_model = raw_request.app.server.model(request.model) + if request.model != triton_model.name: + raise HTTPException( + status_code=400, + detail=f"Mismatched model name: {request.model} != {triton_model.name}", + ) + + if request.n and request.n > 1: + raise HTTPException(status_code=400, detail="Only single choice is supported") + + if request.logit_bias is not None or request.logprobs: + raise HTTPException( + status_code=400, detail="logit bias and log probs not supported" + ) + + conversation = [ + {"role": str(message.role), "content": str(message.content)} + for message in request.messages + ] + + # NOTE: This behavior should be tested further + # TODO: Do these need to be exposed to the user? + add_generation_prompt = True + default_role = "assistant" + role = get_first_response_role(conversation, add_generation_prompt, default_role) + + prompt = metadata.tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + responses = triton_model.infer( + metadata.request_convert_fn(triton_model, prompt, request) + ) + + if request.stream: + return StreamingResponse( + streaming_chat_completion_response( + request_id, created, request.model, role, responses + ), + media_type="text/event-stream", + ) + + # Response validation with decoupled models in mind + responses = list(responses) + validate_triton_responses(responses) + response = responses[0] + text = get_output(response) + + return CreateChatCompletionResponse( + id=request_id, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionResponseMessage( + content=text, role=default_role, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop, + ) + ], + created=created, + model=request.model, + system_fingerprint=None, + object=ObjectType.chat_completion, + ) diff --git a/python/openai/openai/src/routers/completions.py b/python/openai/openai/src/routers/completions.py new file mode 100644 index 0000000000..5d1e9b12fa --- /dev/null +++ b/python/openai/openai/src/routers/completions.py @@ -0,0 +1,125 @@ +import time +import uuid + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import StreamingResponse +from src.schemas.openai import ( + Choice, + CreateCompletionRequest, + CreateCompletionResponse, + FinishReason, + ObjectType, +) +from src.utils.triton import get_output, validate_triton_responses + +router = APIRouter() + + +def streaming_completion_response(request_id, created, model, responses): + for response in responses: + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop if response.final else 
None, + index=0, + logprobs=None, + text=text, + ) + response = CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model, + ) + + yield f"data: {response.json(exclude_unset=True)}\n\n" + yield "data: [DONE]\n\n" + + +@router.post( + "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] +) +def create_completion( + request: CreateCompletionRequest, raw_request: Request +) -> CreateCompletionResponse | StreamingResponse: + """ + Creates a completion for the provided prompt and parameters. + """ + + if not request.model: + raise Exception("Request must provide a valid 'model'") + + print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") + print(f"[DEBUG] Fetching model metadata for {request.model=}") + metadata = raw_request.app.models.get(request.model) + + if not metadata: + raise HTTPException( + status_code=400, detail=f"Unknown model metadata for model: {request.model}" + ) + + if not metadata.request_convert_fn: + raise HTTPException( + status_code=400, detail=f"Unknown request format for model: {request.model}" + ) + + if request.suffix is not None: + raise HTTPException(status_code=400, detail="suffix is not currently supported") + + if request.model != metadata.name: + raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") + + if not request.prompt: + raise HTTPException(status_code=400, detail="prompt must be non-empty") + + # Currently only support single string as input + if not isinstance(request.prompt, str): + raise HTTPException( + status_code=400, detail="only single string input is supported" + ) + + if request.n and request.n > 1: + raise HTTPException(status_code=400, detail="Only single choice is supported") + + if request.logit_bias is not None or request.logprobs is not None: + raise HTTPException( + status_code=400, detail="logit bias and log probs not supported" + ) + + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + + triton_model = raw_request.app.server.model(request.model) + responses = triton_model.infer( + metadata.request_convert_fn(triton_model, request.prompt, request) + ) + if request.stream: + return StreamingResponse( + streaming_completion_response( + request_id, created, metadata.name, responses + ), + media_type="text/event-stream", + ) + + # Response validation with decoupled models in mind + responses = list(responses) + validate_triton_responses(responses) + response = responses[0] + text = get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop, + index=0, + logprobs=None, + text=text, + ) + return CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=metadata.name, + ) diff --git a/python/openai/openai/src/routers/models.py b/python/openai/openai/src/routers/models.py new file mode 100644 index 0000000000..ff47000cfd --- /dev/null +++ b/python/openai/openai/src/routers/models.py @@ -0,0 +1,59 @@ +from fastapi import APIRouter, HTTPException, Request +from src.schemas.openai import ListModelsResponse, Model, ObjectType + +router = APIRouter() + +OWNED_BY = "Triton Inference Server" + + +@router.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) +def list_models(request: Request) -> ListModelsResponse: + """ + Lists the currently available models, and provides basic information about each one such as the owner and 
availability. + """ + model_metadatas = request.app.models + if not model_metadatas: + raise HTTPException(status_code=400, detail="No known models") + + model_list = [] + for model in model_metadatas: + metadata = model_metadatas[model] + if not metadata: + raise HTTPException( + status_code=400, detail=f"No metadata for model: {model}" + ) + + model_list.append( + Model( + id=metadata.name, + created=metadata.create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ), + ) + + return ListModelsResponse(object=ObjectType.list, data=model_list) + + +@router.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) +def retrieve_model(request: Request, model_name: str) -> Model: + """ + Retrieves a model instance, providing basic information about the model such as the owner and permissioning. + """ + model_metadatas = request.app.models + if not model_metadatas: + raise HTTPException(status_code=400, detail="No known models") + + model = model_metadatas.get(model_name) + if not model: + raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") + + if model_name == model.name: + return Model( + id=model.name, + created=model.create_time, + object=ObjectType.model, + owned_by=OWNED_BY, + ) + + raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") diff --git a/python/openai/openai/src/routers/observability.py b/python/openai/openai/src/routers/observability.py new file mode 100644 index 0000000000..98d506dab5 --- /dev/null +++ b/python/openai/openai/src/routers/observability.py @@ -0,0 +1,24 @@ +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import Response + +router = APIRouter() + + +@router.get("/metrics", tags=["Utilities"]) +def metrics(request: Request) -> str: + if not request.app.server or not request.app.server.live(): + raise HTTPException( + status_code=400, detail="Triton Inference Server is not live." + ) + + return request.app.server.metrics() + + +@router.get("/health", tags=["Utilities"]) +def health(request: Request) -> Response: + if not request.app.server or not request.app.server.live(): + raise HTTPException( + status_code=400, detail="Triton Inference Server is not live." + ) + + return Response(status_code=200) diff --git a/python/openai/openai/src/schemas/__init__.py b/python/openai/openai/src/schemas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/src/schemas/openai.py b/python/openai/openai/src/schemas/openai.py new file mode 100644 index 0000000000..488dfda3bb --- /dev/null +++ b/python/openai/openai/src/schemas/openai.py @@ -0,0 +1,871 @@ +# generated by fastapi-codegen: +# filename: api-spec/openai_trimmed.yml +# timestamp: 2024-05-05T21:52:36+00:00 + +from __future__ import annotations + +from enum import Enum +from typing import Any, Dict, List, Optional, Union + +from pydantic import AnyUrl, BaseModel, ConfigDict, Field, RootModel, confloat, conint + + +class Error(BaseModel): + code: str + message: str + param: str + type: str + + +class ErrorResponse(BaseModel): + error: Error + + +class Object(Enum): + list = "list" + + +class DeleteModelResponse(BaseModel): + id: str + deleted: bool + object: str + + +class Model1(Enum): + gpt_3_5_turbo_instruct = "gpt-3.5-turbo-instruct" + davinci_002 = "davinci-002" + babbage_002 = "babbage-002" + + +class PromptItem(RootModel): + root: List[Any] + + +class CreateCompletionRequest(BaseModel): + model: Union[str, Model1] = Field( + ..., + description="ID of the model to use. 
You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them.\n", + ) + prompt: Union[str, List[str], List[int], List[PromptItem]] = Field( + ..., + description="The prompt(s) to generate completions for, encoded as a string, array of strings, array of tokens, or array of token arrays.\n\nNote that <|endoftext|> is the document separator that the model sees during training, so if a prompt is not specified the model will generate as if from the beginning of a new document.\n", + ) + best_of: Optional[conint(ge=0, le=20)] = Field( + 1, + description='Generates `best_of` completions server-side and returns the "best" (the one with the highest log probability per token). Results cannot be streamed.\n\nWhen used with `n`, `best_of` controls the number of candidate completions and `n` specifies how many to return – `best_of` must be greater than `n`.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n', + ) + echo: Optional[bool] = Field( + False, description="Echo back the prompt in addition to the completion\n" + ) + frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + logit_bias: Optional[Dict[str, int]] = Field( + None, + description='Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the GPT tokenizer) to an associated bias value from -100 to 100. You can use this [tokenizer tool](/tokenizer?view=bpe) to convert text to token IDs. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n\nAs an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token from being generated.\n', + ) + logprobs: Optional[conint(ge=0, le=5)] = Field( + None, + description="Include the log probabilities on the `logprobs` most likely output tokens, as well the chosen tokens. For example, if `logprobs` is 5, the API will return a list of the 5 most likely tokens. The API will always return the `logprob` of the sampled token, so there may be up to `logprobs+1` elements in the response.\n\nThe maximum value for `logprobs` is 5.\n", + ) + max_tokens: Optional[conint(ge=0)] = Field( + 16, + description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", + examples=[16], + ) + n: Optional[conint(ge=1, le=128)] = Field( + 1, + description="How many completions to generate for each prompt.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. 
Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n", + examples=[1], + ) + presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( + None, + description="If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\n\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", + ) + stop: Optional[Union[str, List[str]]] = Field( + None, + description="Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.\n", + ) + stream: Optional[bool] = Field( + False, + description="Whether to stream back partial progress. If set, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", + ) + suffix: Optional[str] = Field( + None, + description="The suffix that comes after a completion of inserted text.\n\nThis parameter is only supported for `gpt-3.5-turbo-instruct`.\n", + examples=["test."], + ) + temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( + 1, + description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", + examples=[1], + ) + top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( + 1, + description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", + examples=[1], + ) + user: Optional[str] = Field( + None, + description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", + examples=["user-1234"], + ) + + +class FinishReason(Enum): + stop = "stop" + length = "length" + content_filter = "content_filter" + + +class Logprobs(BaseModel): + text_offset: Optional[List[int]] = None + token_logprobs: Optional[List[float]] = None + tokens: Optional[List[str]] = None + top_logprobs: Optional[List[Dict[str, float]]] = None + + +class Choice(BaseModel): + finish_reason: FinishReason | None = Field( + ..., + description="The reason the model stopped generating tokens. 
This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\nor `content_filter` if content was omitted due to a flag from our content filters.\n", + ) + index: int + logprobs: Logprobs | None + text: str + + +class Object1(Enum): + text_completion = "text_completion" + + +class Type(Enum): + image_url = "image_url" + + +class Detail(Enum): + auto = "auto" + low = "low" + high = "high" + + +class ImageUrl(BaseModel): + url: AnyUrl = Field( + ..., description="Either a URL of the image or the base64 encoded image data." + ) + detail: Optional[Detail] = Field( + "auto", + description="Specifies the detail level of the image. Learn more in the [Vision guide](/docs/guides/vision/low-or-high-fidelity-image-understanding).", + ) + + +class ChatCompletionRequestMessageContentPartImage(BaseModel): + type: Type = Field(..., description="The type of the content part.") + image_url: ImageUrl + + +class Type1(Enum): + text = "text" + + +class ChatCompletionRequestMessageContentPartText(BaseModel): + type: Type1 = Field(..., description="The type of the content part.") + text: str = Field(..., description="The text content.") + + +class Role(Enum): + system = "system" + + def __str__(self): + return self.name + + +class ChatCompletionRequestSystemMessage(BaseModel): + content: str = Field(..., description="The contents of the system message.") + role: Role = Field( + ..., description="The role of the messages author, in this case `system`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + + +class Role1(Enum): + user = "user" + + def __str__(self): + return self.name + + +class Role2(Enum): + assistant = "assistant" + + def __str__(self): + return self.name + + +class FunctionCall(BaseModel): + arguments: str = Field( + ..., + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + name: str = Field(..., description="The name of the function to call.") + + +class Role3(Enum): + tool = "tool" + + def __str__(self): + return self.name + + +class ChatCompletionRequestToolMessage(BaseModel): + role: Role3 = Field( + ..., description="The role of the messages author, in this case `tool`." + ) + content: str = Field(..., description="The contents of the tool message.") + tool_call_id: str = Field( + ..., description="Tool call that this message is responding to." + ) + + +class Role4(Enum): + function = "function" + + def __str__(self): + return self.name + + +class ChatCompletionRequestFunctionMessage(BaseModel): + role: Role4 = Field( + ..., description="The role of the messages author, in this case `function`." 
+ ) + content: str = Field(..., description="The contents of the function message.") + name: str = Field(..., description="The name of the function to call.") + + +class FunctionParameters(BaseModel): + model_config = ConfigDict(extra="allow") + # class Config: + # # TODO: Remove + # #extra = Extra.allow + # extra = "allow" + + +class ChatCompletionFunctions(BaseModel): + description: Optional[str] = Field( + None, + description="A description of what the function does, used by the model to choose when and how to call the function.", + ) + name: str = Field( + ..., + description="The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", + ) + parameters: Optional[FunctionParameters] = None + + +class ChatCompletionFunctionCallOption(BaseModel): + name: str = Field(..., description="The name of the function to call.") + + +class Type2(Enum): + function = "function" + + +class FunctionObject(BaseModel): + description: Optional[str] = Field( + None, + description="A description of what the function does, used by the model to choose when and how to call the function.", + ) + name: str = Field( + ..., + description="The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", + ) + parameters: Optional[FunctionParameters] = None + + +class ChatCompletionToolChoiceOption1(Enum): + none = "none" + auto = "auto" + required = "required" + + +class Function(BaseModel): + name: str = Field(..., description="The name of the function to call.") + + +class ChatCompletionNamedToolChoice(BaseModel): + type: Type2 = Field( + ..., + description="The type of the tool. Currently, only `function` is supported.", + ) + function: Function + + +class Function1(BaseModel): + name: str = Field(..., description="The name of the function to call.") + arguments: str = Field( + ..., + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + + +class ChatCompletionMessageToolCall(BaseModel): + id: str = Field(..., description="The ID of the tool call.") + type: Type2 = Field( + ..., + description="The type of the tool. Currently, only `function` is supported.", + ) + function: Function1 = Field(..., description="The function that the model called.") + + +class Function2(BaseModel): + name: Optional[str] = Field(None, description="The name of the function to call.") + arguments: Optional[str] = Field( + None, + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + + +class ChatCompletionMessageToolCallChunk(BaseModel): + index: int + id: Optional[str] = Field(None, description="The ID of the tool call.") + type: Optional[Type2] = Field( + None, + description="The type of the tool. 
Currently, only `function` is supported.", + ) + function: Optional[Function2] = None + + +class ChatCompletionRole(Enum): + system = "system" + user = "user" + assistant = "assistant" + tool = "tool" + function = "function" + + +class Role5(Enum): + assistant = "assistant" + + def __str__(self): + return self.name + + +class FunctionCall2(BaseModel): + arguments: Optional[str] = Field( + None, + description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", + ) + name: Optional[str] = Field(None, description="The name of the function to call.") + + +class Role6(Enum): + system = "system" + user = "user" + assistant = "assistant" + tool = "tool" + + def __str__(self): + return self.name + + +class ChatCompletionStreamResponseDelta(BaseModel): + content: Optional[str] = Field( + None, description="The contents of the chunk message." + ) + function_call: Optional[FunctionCall2] = Field( + None, + description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", + ) + tool_calls: Optional[List[ChatCompletionMessageToolCallChunk]] = None + role: Optional[str] = Field( + None, description="The role of the author of this message." + ) + + +class Model2(Enum): + gpt_4_turbo = "gpt-4-turbo" + gpt_4_turbo_2024_04_09 = "gpt-4-turbo-2024-04-09" + gpt_4_0125_preview = "gpt-4-0125-preview" + gpt_4_turbo_preview = "gpt-4-turbo-preview" + gpt_4_1106_preview = "gpt-4-1106-preview" + gpt_4_vision_preview = "gpt-4-vision-preview" + gpt_4 = "gpt-4" + gpt_4_0314 = "gpt-4-0314" + gpt_4_0613 = "gpt-4-0613" + gpt_4_32k = "gpt-4-32k" + gpt_4_32k_0314 = "gpt-4-32k-0314" + gpt_4_32k_0613 = "gpt-4-32k-0613" + gpt_3_5_turbo = "gpt-3.5-turbo" + gpt_3_5_turbo_16k = "gpt-3.5-turbo-16k" + gpt_3_5_turbo_0301 = "gpt-3.5-turbo-0301" + gpt_3_5_turbo_0613 = "gpt-3.5-turbo-0613" + gpt_3_5_turbo_1106 = "gpt-3.5-turbo-1106" + gpt_3_5_turbo_0125 = "gpt-3.5-turbo-0125" + gpt_3_5_turbo_16k_0613 = "gpt-3.5-turbo-16k-0613" + + +class Type6(Enum): + text = "text" + json_object = "json_object" + + +class ResponseFormat(BaseModel): + type: Optional[Type6] = Field( + "text", + description="Must be one of `text` or `json_object`.", + examples=["json_object"], + ) + + +class FunctionCall3(Enum): + none = "none" + auto = "auto" + + +class ChatCompletionFinishReason(Enum): + stop = "stop" + length = "length" + tool_calls = "tool_calls" + content_filter = "content_filter" + function_call = "function_call" + + +class Object2(Enum): + chat_completion = "chat.completion" + + +class FinishReason2(Enum): + stop = "stop" + length = "length" + function_call = "function_call" + content_filter = "content_filter" + + +class TopLogprob(BaseModel): + token: str = Field(..., description="The token.") + logprob: float = Field( + ..., + description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", + ) + bytes: List[int] = Field( + ..., + description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. 
Can be `null` if there is no bytes representation for the token.", + ) + + +class ChatCompletionTokenLogprob(BaseModel): + token: str = Field(..., description="The token.") + logprob: float = Field( + ..., + description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", + ) + bytes: List[int] = Field( + ..., + description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be `null` if there is no bytes representation for the token.", + ) + top_logprobs: List[TopLogprob] = Field( + ..., + description="List of the most likely tokens and their log probability, at this token position. In rare cases, there may be fewer than the number of requested `top_logprobs` returned.", + ) + + +class Logprobs2(BaseModel): + content: List[ChatCompletionTokenLogprob] = Field( + ..., + description="A list of message content tokens with log probability information.", + ) + + +class ChatCompletionFinishReason(Enum): + stop = "stop" + length = "length" + tool_calls = "tool_calls" + content_filter = "content_filter" + function_call = "function_call" + + +class ChatCompletionStreamingResponseChoice(BaseModel): + delta: ChatCompletionStreamResponseDelta + logprobs: Optional[Logprobs2] = Field( + None, description="Log probability information for the choice." + ) + finish_reason: ChatCompletionFinishReason | None = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + + +class Object4(Enum): + chat_completion_chunk = "chat.completion.chunk" + + +class CreateChatCompletionStreamResponse(BaseModel): + id: str = Field( + ..., + description="A unique identifier for the chat completion. Each chunk has the same ID.", + ) + choices: List[ChatCompletionStreamingResponseChoice] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created. Each chunk has the same timestamp.", + ) + model: str = Field(..., description="The model to generate the completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object4 = Field( + ..., description="The object type, which is always `chat.completion.chunk`." 
+ ) + + +class CreateChatCompletionImageResponse(BaseModel): + pass + + +class Object5(Enum): + model = "model" + + +class Model(BaseModel): + id: str = Field( + ..., + description="The model identifier, which can be referenced in the API endpoints.", + ) + created: int = Field( + ..., description="The Unix timestamp (in seconds) when the model was created." + ) + object: Object5 = Field( + ..., description='The object type, which is always "model".' + ) + owned_by: str = Field(..., description="The organization that owns the model.") + + +class CompletionUsage(BaseModel): + completion_tokens: int = Field( + ..., description="Number of tokens in the generated completion." + ) + prompt_tokens: int = Field(..., description="Number of tokens in the prompt.") + total_tokens: int = Field( + ..., + description="Total number of tokens used in the request (prompt + completion).", + ) + + +class Event(Enum): + error = "error" + + +class ErrorEvent(BaseModel): + event: Event + data: Error + + +class Event1(Enum): + done = "done" + + +class Data(Enum): + field_DONE_ = "[DONE]" + + +class DoneEvent(BaseModel): + event: Event1 + data: Data + + +class ListModelsResponse(BaseModel): + object: Object + data: List[Model] + + +class CreateCompletionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the completion.") + choices: List[Choice] = Field( + ..., + description="The list of completion choices the model generated for the input prompt.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the completion was created.", + ) + model: str = Field(..., description="The model used for completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object1 = Field( + ..., description='The object type, which is always "text_completion"' + ) + usage: Optional[CompletionUsage] = None + + +class ChatCompletionRequestMessageContentPart(RootModel): + root: Union[ + ChatCompletionRequestMessageContentPartText, + ChatCompletionRequestMessageContentPartImage, + ] + + +class ChatCompletionRequestUserMessage(BaseModel): + content: Union[str, List[ChatCompletionRequestMessageContentPart]] = Field( + ..., description="The contents of the user message.\n" + ) + role: Role1 = Field( + ..., description="The role of the messages author, in this case `user`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + + +class ChatCompletionTool(BaseModel): + type: Type2 = Field( + ..., + description="The type of the tool. 
Currently, only `function` is supported.", + ) + function: FunctionObject + + +class ChatCompletionToolChoiceOption(RootModel): + root: Union[ChatCompletionToolChoiceOption1, ChatCompletionNamedToolChoice] = Field( + ..., + description='Controls which (if any) tool is called by the model.\n`none` means the model will not call any tool and instead generates a message.\n`auto` means the model can pick between generating a message or calling one or more tools.\n`required` means the model must call one or more tools.\nSpecifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool.\n\n`none` is the default when no tools are present. `auto` is the default if tools are present.\n', + ) + + +class ChatCompletionMessageToolCalls(RootModel): + root: List[ChatCompletionMessageToolCall] = Field( + ..., + description="The tool calls generated by the model, such as function calls.", + ) + + +class ChatCompletionResponseMessage(BaseModel): + content: str = Field(..., description="The contents of the message.") + tool_calls: Optional[ChatCompletionMessageToolCalls] = None + role: str = Field(..., description="The role of the author of this message.") + function_call: Optional[FunctionCall] = Field( + None, + description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", + ) + + +class ChatCompletionChoice(BaseModel): + finish_reason: ChatCompletionFinishReason = Field( + ..., + description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + message: ChatCompletionResponseMessage + logprobs: Logprobs2 | None = Field( + ..., description="Log probability information for the choice." + ) + + +class CreateChatCompletionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the chat completion.") + choices: List[ChatCompletionChoice] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created.", + ) + model: str = Field(..., description="The model used for the chat completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object2 = Field( + ..., description="The object type, which is always `chat.completion`." + ) + usage: Optional[CompletionUsage] = None + + +class Choice2(BaseModel): + finish_reason: FinishReason2 = Field( + ..., + description="The reason the model stopped generating tokens. 
This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, `content_filter` if content was omitted due to a flag from our content filters, or `function_call` if the model called a function.\n", + ) + index: int = Field( + ..., description="The index of the choice in the list of choices." + ) + message: ChatCompletionResponseMessage + + +class CreateChatCompletionFunctionResponse(BaseModel): + id: str = Field(..., description="A unique identifier for the chat completion.") + choices: List[Choice2] = Field( + ..., + description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", + ) + created: int = Field( + ..., + description="The Unix timestamp (in seconds) of when the chat completion was created.", + ) + model: str = Field(..., description="The model used for the chat completion.") + system_fingerprint: Optional[str] = Field( + None, + description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", + ) + object: Object2 = Field( + ..., description="The object type, which is always `chat.completion`." + ) + usage: Optional[CompletionUsage] = None + + +class ChatCompletionRequestAssistantMessage(BaseModel): + content: Optional[str] = Field( + None, + description="The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.\n", + ) + role: Role2 = Field( + ..., description="The role of the messages author, in this case `assistant`." + ) + name: Optional[str] = Field( + None, + description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", + ) + tool_calls: Optional[ChatCompletionMessageToolCalls] = None + function_call: Optional[FunctionCall] = Field( + None, + description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", + ) + + +class ChatCompletionRequestMessage(RootModel): + root: Union[ + ChatCompletionRequestSystemMessage, + ChatCompletionRequestUserMessage, + ChatCompletionRequestAssistantMessage, + ChatCompletionRequestToolMessage, + ChatCompletionRequestFunctionMessage, + ] + + @property + def role(self): + return self.root.role + + @property + def content(self): + return self.root.content + + +class CreateChatCompletionRequest(BaseModel): + messages: List[ChatCompletionRequestMessage] = Field( + ..., + description="A list of messages comprising the conversation so far. [Example Python code](https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models).", + min_length=1, + ) + model: Union[str, Model2] = Field( + ..., + description="ID of the model to use. See the [model endpoint compatibility](/docs/models/model-endpoint-compatibility) table for details on which models work with the Chat API.", + examples=["gpt-4-turbo"], + ) + frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + logit_bias: Optional[Dict[str, int]] = Field( + None, + description="Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n", + ) + logprobs: Optional[bool] = Field( + False, + description="Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the `content` of `message`.", + ) + top_logprobs: Optional[conint(ge=0, le=20)] = Field( + None, + description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.", + ) + max_tokens: Optional[conint(ge=0)] = Field( + 16, + description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", + ) + n: Optional[conint(ge=1, le=128)] = Field( + 1, + description="How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.", + examples=[1], + ) + presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( + 0, + description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", + ) + response_format: Optional[ResponseFormat] = Field( + None, + description='An object specifying the format that the model must output. Compatible with [GPT-4 Turbo](/docs/models/gpt-4-and-gpt-4-turbo) and all GPT-3.5 Turbo models newer than `gpt-3.5-turbo-1106`.\n\nSetting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.\n\n**Important:** when using JSON mode, you **must** also instruct the model to produce JSON yourself via a system or user message. Without this, the model may generate an unending stream of whitespace until the generation reaches the token limit, resulting in a long-running and seemingly "stuck" request. 
Also note that the message content may be partially cut off if `finish_reason="length"`, which indicates the generation exceeded `max_tokens` or the conversation exceeded the max context length.\n', + ) + seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( + None, + description="This feature is in Beta.\nIf specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", + ) + stop: Optional[Union[str, List[str]]] = Field( + None, + description="Up to 4 sequences where the API will stop generating further tokens.\n", + ) + stream: Optional[bool] = Field( + False, + description="If set, partial message deltas will be sent, like in ChatGPT. Tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", + ) + temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( + 0.7, + description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", + examples=[1], + ) + top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( + 1, + description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", + examples=[1], + ) + tools: Optional[List[ChatCompletionTool]] = Field( + None, + description="A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. A max of 128 functions are supported.\n", + ) + tool_choice: Optional[ChatCompletionToolChoiceOption] = None + user: Optional[str] = Field( + None, + description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", + examples=["user-1234"], + ) + function_call: Optional[ + Union[FunctionCall3, ChatCompletionFunctionCallOption] + ] = Field( + None, + description='Deprecated in favor of `tool_choice`.\n\nControls which (if any) function is called by the model.\n`none` means the model will not call a function and instead generates a message.\n`auto` means the model can pick between generating a message or calling a function.\nSpecifying a particular function via `{"name": "my_function"}` forces the model to call that function.\n\n`none` is the default when no functions are present. 
`auto` is the default if functions are present.\n', + ) + functions: Optional[List[ChatCompletionFunctions]] = Field( + None, + description="Deprecated in favor of `tools`.\n\nA list of functions the model may generate JSON inputs for.\n", + max_length=128, + min_length=1, + ) + + +# Additional Aliases for Convenience + + +class ObjectType: + model = Object5.model + list = Object.list + text_completion = Object1.text_completion + chat_completion_chunk = Object4.chat_completion_chunk + chat_completion = Object2.chat_completion diff --git a/python/openai/openai/src/utils/__init__.py b/python/openai/openai/src/utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/src/utils/tokenizer.py b/python/openai/openai/src/utils/tokenizer.py new file mode 100644 index 0000000000..a60783a5f9 --- /dev/null +++ b/python/openai/openai/src/utils/tokenizer.py @@ -0,0 +1,77 @@ +from typing import Optional, Union + +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast + + +def get_cached_tokenizer( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Get tokenizer with cached properties. + + This will patch the tokenizer object in place. + + By default, transformers will recompute multiple tokenizer properties + each time they are called, leading to a significant slowdown. This + function caches these properties for faster access.""" + + tokenizer_all_special_ids = set(tokenizer.all_special_ids) + tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended + tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + tokenizer_len = len(tokenizer) + + class CachedTokenizer(tokenizer.__class__): # type: ignore + @property + def all_special_ids(self): + return tokenizer_all_special_ids + + @property + def all_special_tokens(self): + return tokenizer_all_special_tokens + + @property + def all_special_tokens_extended(self): + return tokenizer_all_special_tokens_extended + + def __len__(self): + return tokenizer_len + + CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" + + tokenizer.__class__ = CachedTokenizer + return tokenizer + + +def get_tokenizer( + tokenizer_name: str, + *args, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + tokenizer_revision: Optional[str] = None, + download_dir: Optional[str] = None, + **kwargs, +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Gets a tokenizer for the given model name via Huggingface/modelscope.""" + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") + kwargs["use_fast"] = False + + try: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, + *args, + trust_remote_code=trust_remote_code, + tokenizer_revision=tokenizer_revision, + **kwargs, + ) + except ValueError as e: + raise e + except AttributeError as e: + raise e + + if not isinstance(tokenizer, PreTrainedTokenizerFast): + print( + "Using a slow tokenizer. This might cause a significant " + "slowdown. Consider using a fast tokenizer instead." 
+ ) + return get_cached_tokenizer(tokenizer) diff --git a/python/openai/openai/src/utils/triton.py b/python/openai/openai/src/utils/triton.py new file mode 100644 index 0000000000..42a92fa34d --- /dev/null +++ b/python/openai/openai/src/utils/triton.py @@ -0,0 +1,219 @@ +import os +import time +import typing +from dataclasses import dataclass + +import numpy as np +import tritonserver +from fastapi import HTTPException +from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest +from src.utils.tokenizer import get_tokenizer + +# TODO: Refactor +# NOTE: Allow python backend for testing purposes +SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm", "python"} +LLM_BACKENDS: set = {"vllm", "tensorrtllm"} + + +# TODO: pydantic validation? +@dataclass +class TritonModelMetadata: + # Name used in Triton model repository + name: str + # Name of backend used by Triton + backend: str + # Triton model object handle + model: tritonserver.Model + # TODO: Address typing + tokenizer: typing.Optional[typing.Any] + # Time that model was loaded by Triton + create_time: int + # TODO: Address typing + request_convert_fn: typing.Optional[typing.Any] + + +def determine_request_format(backend): + # Request conversion from OpenAI format to backend-specific format + if backend == "vllm": + request_convert_fn = create_vllm_inference_request + # Python included to support TRT-LLM BLS model and TRT-LLM python runtime + elif backend in ["tensorrtllm", "python"]: + request_convert_fn = create_trtllm_inference_request + else: + request_convert_fn = None + + return request_convert_fn + + +# TODO: Refactor: +# NOTE: We need to figure out a few things while looking at the models in the +# triton model repository. +# 1. Which model should we interact with when sending requests to Triton core? +# a. For a single model, this is trivial, and would support any backend. +# b. For TRT-LLM, this should be 'ensemble' or 'tensorrt_llm_bls' following +# TRT-LLM defaults/examples. However, this could also be renamed by the user +# to have a more intuitive front-facing name, such as "llama3-8b". Note that +# TRT-LLM pipelines produced by the Triton CLI will generally be renamed like +# this. FIXME: This is a relatively fragile flow and should be improved. +# 2. Which tokenizer to use for things like applying a chat template or making +# a tool/function call. These are primarily relevant for the /chat/completions +# endpoint, but not the /completions endpoint. +# - For now, require user-defined TOKENIZER for simplicity. +# 3. Which inputs/outputs/parameters should be set when creating the underlying +# triton inference request? The inference request fields required will differ +# for vLLM, TRT-LLM, and user-defined models like a custom python model. So we +# need to know how to correctly translate the OpenAI schema parameters to +# a triton inference request. +# - For now, we will look for either vllm or trtllm in list of loaded backends, +# and we consider python==trtllm for now due to possibility of python runtime. +# We may want to consider using Triton's "runtime" config field for this for +# easier detection instead. +def load_models(server): + model_metadatas = [] + backends = [] + + # TODO: Support tokenizers more generically or custom tokenizers, possibly + # by looking for tokenizer.json in a pre-specified location? 
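+    # A single tokenizer, taken from the TOKENIZER environment variable, is
+    # shared by every model loaded below.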
+ tokenizer = None + tokenizer_model = os.environ.get("TOKENIZER") + if tokenizer_model: + print(f"Using env var TOKENIZER={tokenizer_model} to determine the tokenizer") + tokenizer = get_tokenizer(tokenizer_model) + + models = [] + backends = [] + names = [] + # Load all triton models and gather the respective backends of each + for name, version in server.models().keys(): + # TODO: Why skip known version? Already loaded? + if version != -1: + continue + + model = server.load(name) + backend = model.config()["backend"] + + names.append(name) + models.append(model) + backends.append(backend) + print(f"Loaded: {name=}, {backend=}, tokenizer={tokenizer_model}") + + create_time = int(time.time()) + + # One tokenizer, convert function, and creation time for all loaded models. + # NOTE: This doesn't currently support having both a vLLM and TRT-LLM + # model loaded at the same time. + for name, model, backend in zip(names, models, backends): + metadata = TritonModelMetadata( + name=name, + backend=backend, + model=model, + tokenizer=tokenizer, + create_time=create_time, + request_convert_fn=determine_request_format(backend), + ) + model_metadatas.append(metadata) + + return model_metadatas + + +def init_tritonserver(): + model_repository = os.environ.get( + "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" + ) + log_verbose_level = int(os.environ.get("TRITON_LOG_VERBOSE_LEVEL", "0")) + + print("Starting Triton Server Core...") + server = tritonserver.Server( + model_repository=model_repository, + log_verbose=log_verbose_level, + log_info=True, + log_warn=True, + log_error=True, + model_control_mode=tritonserver.ModelControlMode.EXPLICIT, + ).start(wait_until_ready=True) + + print("Loading Models...") + metadatas = load_models(server) + return server, metadatas + + +def get_output(response): + if "text_output" in response.outputs: + try: + return response.outputs["text_output"].to_string_array()[0] + except: + return str(response.outputs["text_output"].to_bytes_array()[0]) + return "" + + +def validate_triton_responses(responses): + num_responses = len(responses) + if num_responses == 1 and responses[0].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses == 2 and responses[-1].final != True: + raise HTTPException( + status_code=400, + detail="Unexpected internal error with incorrect response flags", + ) + if num_responses > 2: + raise HTTPException( + status_code=400, + detail=f"Unexpected number of responses: {num_responses}, expected 1.", + ) + + +def create_vllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + excludes = {"model", "stream", "messages", "prompt", "echo"} + + # NOTE: The exclude_none is important, as internals may not support + # values of NoneType at this time. 
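+    # The remaining OpenAI request fields (temperature, top_p, max_tokens,
+    # seed, etc.) are forwarded to the vLLM backend as sampling parameters.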
+ sampling_parameters = request.model_dump( + exclude=excludes, + exclude_none=True, + ) + print(f"[DEBUG] {sampling_parameters=}") + + inputs["text_input"] = [prompt] + inputs["stream"] = [request.stream] + exclude_input_in_output = True + echo = getattr(request, "echo", None) + if echo: + exclude_input_in_output = not echo + inputs["exclude_input_in_output"] = [exclude_input_in_output] + + print(f"[DEBUG] Triton Inference Request {inputs=}") + return model.create_request(inputs=inputs, parameters=sampling_parameters) + + +def create_trtllm_inference_request( + model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest +): + inputs = {} + inputs["text_input"] = [[prompt]] + inputs["stream"] = [[request.stream]] + if request.max_tokens: + inputs["max_tokens"] = np.int32([[request.max_tokens]]) + if request.stop: + if isinstance(request.stop, str): + request.stop = [request.stop] + inputs["stop_words"] = [request.stop] + # Check "is not None" specifically, because values of zero are valid. + if request.top_p is not None: + inputs["top_p"] = np.float32([[request.top_p]]) + if request.frequency_penalty is not None: + inputs["frequency_penalty"] = np.float32([[request.frequency_penalty]]) + if request.presence_penalty is not None: + inputs["presence_penalty"] = np.float32([[request.presence_penalty]]) + if request.seed is not None: + inputs["random_seed"] = np.uint64([[request.seed]]) + if request.temperature is not None: + inputs["temperature"] = np.float32([[request.temperature]]) + + print(f"[DEBUG] Triton Inference Request {inputs=}") + return model.create_request(inputs=inputs) diff --git a/python/openai/openai/tests/__init__.py b/python/openai/openai/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/tests/conftest.py b/python/openai/openai/tests/conftest.py new file mode 100644 index 0000000000..da6301fa04 --- /dev/null +++ b/python/openai/openai/tests/conftest.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from tests.utils import OpenAIServer, setup_fastapi_app + +### TEST ENVIRONMENT SETUP ### +TEST_BACKEND = "" +TEST_MODEL = "" +TEST_PROMPT = "What is machine learning?" +TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] +TEST_TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct" +try: + import vllm as _ + + TEST_BACKEND = "vllm" + TEST_MODEL = "llama-3.1-8b-instruct" +except ImportError: + pass + +try: + import tensorrt_llm as _ + + TEST_BACKEND = "tensorrtllm" + TEST_MODEL = "tensorrt_llm_bls" +except ImportError: + pass + +if not TEST_BACKEND or not TEST_MODEL: + raise Exception("Unknown test environment") +### + + +# NOTE: OpenAI client requires actual server running, and won't work +# with the FastAPI TestClient. Run the server at module scope to run +# only once for all the tests below. +@pytest.fixture(scope="module") +def server(): + model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" + args = ["--model-repository", model_repository, "--tokenizer", TEST_TOKENIZER] + + with OpenAIServer(args) as openai_server: + yield openai_server + + +# NOTE: The FastAPI TestClient acts like a server and triggers the FastAPI app +# lifespan startup/shutdown, but does not actually expose the network port to interact +# with arbitrary clients - you must use the TestClient returned to interact with +# the "server" when "starting the server" via TestClient. 
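+# A rough sketch of the difference (the endpoint path is illustrative):
+#
+#   with TestClient(app) as client:      # in-process, no network port opened
+#       client.get("/v1/models")
+#
+# whereas the `server` fixture above provides an actual running server that the
+# standard `openai` client can reach over HTTP.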
+@pytest.fixture(scope="class") +def fastapi_client_class_scope(): + model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") + app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, model_repository=model_repository) + with TestClient(app) as test_client: + yield test_client + + +@pytest.fixture +def model(): + return TEST_MODEL + + +@pytest.fixture +def backend(): + return TEST_BACKEND + + +@pytest.fixture +def prompt(): + return TEST_PROMPT + + +@pytest.fixture +def messages(): + return TEST_MESSAGES diff --git a/python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp b/python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt new file mode 100644 index 0000000000..b82990446d --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt @@ -0,0 +1,470 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
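+
+# The "ensemble" model below chains the other models in this test repository:
+# preprocessing (tokenization) -> tensorrt_llm (generation) -> postprocessing
+# (detokenization), so a client can send plain text_input and read text_output
+# without handling token IDs itself.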
+ +name: "ensemble" +platform: "ensemble" +max_batch_size: 64 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "max_tokens" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocessing" + model_version: -1 + input_map { + key: "QUERY" + value: "text_input" + } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } + input_map { + key: "REQUEST_OUTPUT_LEN" + value: "max_tokens" + } + input_map { + key: "BAD_WORDS_DICT" + value: "bad_words" + } + input_map { + key: "STOP_WORDS_DICT" + value: "stop_words" + } + input_map { + key: "EMBEDDING_BIAS_WORDS" + value: "embedding_bias_words" + } + input_map { + key: "EMBEDDING_BIAS_WEIGHTS" + value: "embedding_bias_weights" + } + input_map { + key: "END_ID" + value: "end_id" + } + input_map { + key: "PAD_ID" + value: "pad_id" + } + output_map { + key: "REQUEST_INPUT_LEN" + value: "_REQUEST_INPUT_LEN" + } + output_map { + key: "INPUT_ID" + value: "_INPUT_ID" + } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + 
output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } + output_map { + key: "REQUEST_OUTPUT_LEN" + value: "_REQUEST_OUTPUT_LEN" + } + output_map { + key: "STOP_WORDS_IDS" + value: "_STOP_WORDS_IDS" + } + output_map { + key: "BAD_WORDS_IDS" + value: "_BAD_WORDS_IDS" + } + output_map { + key: "EMBEDDING_BIAS" + value: "_EMBEDDING_BIAS" + } + output_map { + key: "OUT_END_ID" + value: "_PREPROCESSOR_END_ID" + } + output_map { + key: "OUT_PAD_ID" + value: "_PREPROCESSOR_PAD_ID" + } + }, + { + model_name: "tensorrt_llm" + model_version: -1 + input_map { + key: "input_ids" + value: "_INPUT_ID" + } + input_map { + key: "decoder_input_ids" + value: "_DECODER_INPUT_ID" + } + input_map { + key: "input_lengths" + value: "_REQUEST_INPUT_LEN" + } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } + input_map { + key: "request_output_len" + value: "_REQUEST_OUTPUT_LEN" + } + input_map { + key: "end_id" + value: "_PREPROCESSOR_END_ID" + } + input_map { + key: "pad_id" + value: "_PREPROCESSOR_PAD_ID" + } + input_map { + key: "embedding_bias" + value: "_EMBEDDING_BIAS" + } + input_map { + key: "runtime_top_k" + value: "top_k" + } + input_map { + key: "runtime_top_p" + value: "top_p" + } + input_map { + key: "temperature" + value: "temperature" + } + input_map { + key: "len_penalty" + value: "length_penalty" + } + input_map { + key: "repetition_penalty" + value: "repetition_penalty" + } + input_map { + key: "min_length" + value: "min_length" + } + input_map { + key: "presence_penalty" + value: "presence_penalty" + } + input_map { + key: "frequency_penalty" + value: "frequency_penalty" + } + input_map { + key: "random_seed" + value: "random_seed" + } + input_map { + key: "return_log_probs" + value: "return_log_probs" + } + input_map { + key: "return_context_logits" + value: "return_context_logits" + } + input_map { + key: "return_generation_logits" + value: "return_generation_logits" + } + input_map { + key: "beam_width" + value: "beam_width" + } + input_map { + key: "streaming" + value: "stream" + } + input_map { + key: "prompt_embedding_table" + value: "prompt_embedding_table" + } + input_map { + key: "prompt_vocab_size" + value: "prompt_vocab_size" + } + input_map { + key: "stop_words_list" + value: "_STOP_WORDS_IDS" + } + input_map { + key: "bad_words_list" + value: "_BAD_WORDS_IDS" + } + output_map { + key: "output_ids" + value: "_TOKENS_BATCH" + } + output_map { + key: "sequence_length" + value: "_SEQUENCE_LENGTH" + }, + output_map { + key: "cum_log_probs" + value: "_CUM_LOG_PROBS" + } + output_map { + key: "output_log_probs" + value: "_OUTPUT_LOG_PROBS" + }, + output_map { + key: "context_logits" + value: "_CONTEXT_LOGITS" + }, + output_map { + key: "generation_logits" + value: "_GENERATION_LOGITS" + } + }, + { + model_name: "postprocessing" + model_version: -1 + input_map { + key: "TOKENS_BATCH" + value: "_TOKENS_BATCH" + } + input_map { + key: "CUM_LOG_PROBS" + value: "_CUM_LOG_PROBS" + } + input_map { + key: "OUTPUT_LOG_PROBS" + value: "_OUTPUT_LOG_PROBS" + } + input_map { + key: "CONTEXT_LOGITS" + value: "_CONTEXT_LOGITS" + } + input_map { + key: "GENERATION_LOGITS" + value: "_GENERATION_LOGITS" + } + input_map { + key: "SEQUENCE_LENGTH" + value: "_SEQUENCE_LENGTH" + } + output_map { + key: "OUTPUT" + value: "text_output" + } + output_map { + key: "OUT_OUTPUT_LOG_PROBS" + value: "output_log_probs" + } + output_map { + key: "OUT_CUM_LOG_PROBS" + value: "cum_log_probs" + } + output_map { + key: "OUT_CONTEXT_LOGITS" + value: "context_logits" + } + 
output_map { + key: "OUT_GENERATION_LOGITS" + value: "generation_logits" + } + } + ] +} diff --git a/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py b/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py new file mode 100644 index 0000000000..0812e19b3e --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py @@ -0,0 +1,246 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] + + skip_special_tokens = model_config["parameters"].get("skip_special_tokens") + if skip_special_tokens is not None: + skip_special_tokens_str = skip_special_tokens["string_value"].lower() + if skip_special_tokens_str in [ + "true", + "false", + "1", + "0", + "t", + "f", + "y", + "n", + "yes", + "no", + ]: + self.skip_special_tokens = skip_special_tokens_str in [ + "true", + "1", + "t", + "y", + "yes", + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + ) + self.skip_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." + ) + self.skip_special_tokens = True + + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True + ) + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. 
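+ # The loop below decodes each TOKENS_BATCH into strings with the HF tokenizer
+ # and forwards the optional log-prob/logit tensors unchanged, substituting
+ # zero-filled placeholders for any that were not provided.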
+ for idx, request in enumerate(requests): + # Get input tensors + tokens_batch = pb_utils.get_input_tensor_by_name( + request, "TOKENS_BATCH" + ).as_numpy() + + # Get sequence length + sequence_lengths = pb_utils.get_input_tensor_by_name( + request, "SEQUENCE_LENGTH" + ).as_numpy() + + # Get cum log probs + cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS") + + # Get sequence length + output_log_probs = pb_utils.get_input_tensor_by_name( + request, "OUTPUT_LOG_PROBS" + ) + + # Get context logits + context_logits = pb_utils.get_input_tensor_by_name( + request, "CONTEXT_LOGITS" + ) + + # Get generation logits + generation_logits = pb_utils.get_input_tensor_by_name( + request, "GENERATION_LOGITS" + ) + + # Reshape Input + # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) + # tokens_batch = tokens_batch.T + + # Postprocessing output data. + outputs = self._postprocessing(tokens_batch, sequence_lengths) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array(outputs).astype(self.output_dtype) + ) + + outputs = [] + outputs.append(output_tensor) + + if cum_log_probs: + out_cum_log_probs = pb_utils.Tensor( + "OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy() + ) + outputs.append(out_cum_log_probs) + else: + out_cum_log_probs = pb_utils.Tensor( + "OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32) + ) + outputs.append(out_cum_log_probs) + + if output_log_probs: + out_output_log_probs = pb_utils.Tensor( + "OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy() + ) + outputs.append(out_output_log_probs) + else: + out_output_log_probs = pb_utils.Tensor( + "OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32) + ) + outputs.append(out_output_log_probs) + + if context_logits: + out_context_logits = pb_utils.Tensor( + "OUT_CONTEXT_LOGITS", context_logits.as_numpy() + ) + outputs.append(out_context_logits) + else: + out_context_logits = pb_utils.Tensor( + "OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32) + ) + outputs.append(out_context_logits) + + if generation_logits: + out_generation_logits = pb_utils.Tensor( + "OUT_GENERATION_LOGITS", generation_logits.as_numpy() + ) + outputs.append(out_generation_logits) + else: + out_generation_logits = pb_utils.Tensor( + "OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32) + ) + outputs.append(out_generation_logits) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse(output_tensors=outputs) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. 
+ """ + print("Cleaning up...") + + def _postprocessing(self, tokens_batch, sequence_lengths): + outputs = [] + for batch_idx, beam_tokens in enumerate(tokens_batch): + for beam_idx, tokens in enumerate(beam_tokens): + seq_len = sequence_lengths[batch_idx][beam_idx] + output = self.tokenizer.decode( + tokens[:seq_len], skip_special_tokens=self.skip_special_tokens + ) + outputs.append(output.encode("utf8")) + return outputs diff --git a/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt new file mode 100644 index 0000000000..dee851662d --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt @@ -0,0 +1,113 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "postprocessing" +backend: "python" +max_batch_size: 256 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + }, + { + name: "SEQUENCE_LENGTH" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "CUM_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "OUTPUT_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + }, + { + name: "CONTEXT_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + }, + { + name: "GENERATION_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + optional: true + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "OUT_CUM_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "OUT_OUTPUT_LOG_PROBS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "OUT_CONTEXT_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "OUT_GENERATION_LOGITS" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" + } +} + +parameters { + key: "skip_special_tokens" + value: { + string_value: "${skip_special_tokens}" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py b/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py new file mode 100644 index 0000000000..eb4487c803 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py @@ -0,0 +1,418 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from typing import List + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. 
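+ In this test repository, the preprocessing model tokenizes QUERY text into
+ INPUT_ID tensors and converts stop/bad word lists and embedding-bias inputs
+ into the tensor formats expected by the tensorrt_llm model.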
+ """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] + + add_special_tokens = model_config["parameters"].get("add_special_tokens") + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens["string_value"].lower() + if add_special_tokens_str in [ + "true", + "false", + "1", + "0", + "t", + "f", + "y", + "n", + "yes", + "no", + ]: + self.add_special_tokens = add_special_tokens_str in [ + "true", + "1", + "t", + "y", + "yes", + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." + ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." + ) + self.add_special_tokens = True + + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True + ) + if isinstance(self.tokenizer, T5Tokenizer): + self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.tokenizer_end_id = self.tokenizer.encode( + self.tokenizer.eos_token, add_special_tokens=False + )[0] + self.tokenizer_pad_id = self.tokenizer.encode( + self.tokenizer.pad_token, add_special_tokens=False + )[0] + + # Parse model output configs and convert Triton types to numpy types + output_names = [ + "INPUT_ID", + "DECODER_INPUT_ID", + "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + "OUT_END_ID", + "OUT_PAD_ID", + ] + input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] + for input_name in input_names: + setattr( + self, + input_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_input_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) + + for output_name in output_names: + setattr( + self, + output_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name(model_config, output_name)[ + "data_type" + ] + ), + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. 
+ Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + logger = pb_utils.Logger + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() + decoder_query = pb_utils.get_input_tensor_by_name(request, "DECODER_QUERY") + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + + batch_dim = query.shape[0] + if batch_dim != 1: + err_str = ( + "Inflight batching backend expects requests with batch size of 1." + ) + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], error=pb_utils.TritonError(err_str) + ) + ) + continue + + request_output_len = pb_utils.get_input_tensor_by_name( + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() + + bad_words_dict = pb_utils.get_input_tensor_by_name( + request, "BAD_WORDS_DICT" + ) + if bad_words_dict is not None: + bad_words_dict = bad_words_dict.as_numpy() + + stop_words_dict = pb_utils.get_input_tensor_by_name( + request, "STOP_WORDS_DICT" + ) + if stop_words_dict is not None: + stop_words_dict = stop_words_dict.as_numpy() + + embedding_bias_words = pb_utils.get_input_tensor_by_name( + request, "EMBEDDING_BIAS_WORDS" + ) + if embedding_bias_words is not None: + embedding_bias_words = embedding_bias_words.as_numpy() + + embedding_bias_weights = pb_utils.get_input_tensor_by_name( + request, "EMBEDDING_BIAS_WEIGHTS" + ) + if embedding_bias_weights is not None: + embedding_bias_weights = embedding_bias_weights.as_numpy() + + # Take the end_id from the input tensors + # If not specified, use tokenizer to get end_id + end_id = pb_utils.get_input_tensor_by_name(request, "END_ID") + if end_id is not None: + end_id = end_id.as_numpy() + else: + end_id = [[self.tokenizer_end_id]] + + # Take the pad_id from the input tensors + # If not specified, use tokenizer to get pad_id + pad_id = pb_utils.get_input_tensor_by_name(request, "PAD_ID") + if pad_id is not None: + pad_id = pad_id.as_numpy() + else: + pad_id = [[self.tokenizer_pad_id]] + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query + ) + else: + decoder_input_id = pad_id * np.ones((1, 1), np.int32) + request_decoder_input_len = 1 * np.ones((1, 1), np.int32) + + bad_words = self._to_word_list_format(bad_words_dict) + stop_words = self._to_word_list_format(stop_words_dict) + + embedding_bias = self._get_embedding_bias( + embedding_bias_words, + embedding_bias_weights, + self.embedding_bias_weights_dtype, + ) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
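+ # Each tensor created below corresponds to an entry in this model's output
+ # config (INPUT_ID, REQUEST_INPUT_LEN, ..., OUT_END_ID, OUT_PAD_ID).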
+ input_id_tensor = pb_utils.Tensor( + "INPUT_ID", input_id.astype(self.input_id_dtype) + ) + request_input_len_tensor = pb_utils.Tensor( + "REQUEST_INPUT_LEN", + request_input_len.astype(self.request_input_len_dtype), + ) + decoder_input_id_tensor = pb_utils.Tensor( + "DECODER_INPUT_ID", decoder_input_id.astype(self.decoder_input_id_dtype) + ) + request_decoder_input_len_tensor = pb_utils.Tensor( + "REQUEST_DECODER_INPUT_LEN", + request_decoder_input_len.astype(self.request_decoder_input_len_dtype), + ) + request_output_len_tensor = pb_utils.Tensor( + "REQUEST_OUTPUT_LEN", request_output_len + ) + bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) + stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) + embedding_bias_tensor = pb_utils.Tensor("EMBEDDING_BIAS", embedding_bias) + end_id_tensor = pb_utils.Tensor( + "OUT_END_ID", np.array(end_id, dtype=np.int32) + ) + pad_id_tensor = pb_utils.Tensor( + "OUT_PAD_ID", np.array(pad_id, dtype=np.int32) + ) + + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + decoder_input_id_tensor, + bad_words_ids_tensor, + stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, + embedding_bias_tensor, + end_id_tensor, + pad_id_tensor, + ] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") + + def _create_request(self, query): + """ + query : batch string (2D numpy array) + """ + if isinstance(self.tokenizer, T5Tokenizer): + start_ids = [ + np.array( + [self.tokenizer_bos_id] + + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + else: + start_ids = [ + np.array( + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) + + max_len = 0 + for seq in start_ids: + max_len = max(max_len, seq.shape[0]) + start_ids = np.stack( + [ + np.pad( + seq, + (0, max_len - seq.shape[0]), + "constant", + constant_values=(0, self.tokenizer_pad_id), + ) + for seq in start_ids + ] + ) + + return start_ids, start_lengths + + def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + """ + word_lists format: + len(word_lists) == batch_size + word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
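+ The returned array has shape [len(word_lists), 2, pad_to]: for each item,
+ row 0 holds the concatenated token ids of its words and row 1 holds the
+ cumulative end offset of each word, padded with -1.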
+ """ + assert self.tokenizer != None, "need to set tokenizer" + + if word_lists is None: + # Return an empty array of shape (1,2,0) + return np.empty([1, 2, 0], dtype="int32") + + flat_ids = [] + offsets = [] + for word_list in word_lists: + item_flat_ids = [] + item_offsets = [] + + for word in word_list: + if isinstance(word, bytes): + word = word.decode() + + ids = self.tokenizer.encode(word, add_special_tokens=False) + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) + + def _get_embedding_bias( + self, embedding_bias_words, embedding_bias_weights, bias_dtype + ): + assert self.tokenizer != None, "need to set tokenizer" + + if embedding_bias_words is None or embedding_bias_weights is None: + return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + + batch_embedding_bias = [] + for words, weights in zip(embedding_bias_words, embedding_bias_weights): + vocab_size = self.tokenizer.vocab_size + embedding_bias = [0.0] * vocab_size + + assert len(words) == len( + weights + ), "Embedding bias words must have same dimension as embedding bias weights" + + for word, weight in zip(words, weights): + if isinstance(word, bytes): + word = word.decode() + ids = self.tokenizer.encode(word) + + if len(ids) == 0: + continue + + for id in ids: + embedding_bias[id] += weight + + batch_embedding_bias.append(np.array(embedding_bias)) + + return np.array(batch_embedding_bias, dtype=bias_dtype) diff --git a/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt new file mode 100644 index 0000000000..a262cf6983 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt @@ -0,0 +1,156 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "preprocessing" +backend: "python" +max_batch_size: 256 +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "BAD_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "STOP_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "EMBEDDING_BIAS_WORDS" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "EMBEDDING_BIAS_WEIGHTS" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "END_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + }, + { + name: "PAD_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "BAD_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "STOP_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "EMBEDDING_BIAS" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "OUT_END_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "OUT_PAD_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" + } +} + +parameters { + key: "add_special_tokens" + value: { + string_value: "${add_special_tokens}" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py new file mode 100644 index 0000000000..3425a20f57 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py @@ -0,0 +1,797 @@ +import datetime +import json +import os +import time +from threading import Lock, Thread + +import numpy as np +import tensorrt_llm.bindings.executor as trtllm +import triton_python_backend_utils as pb_utils +from torch import from_numpy + + +def get_input_tensor_by_name(request, name): + tensor = pb_utils.get_input_tensor_by_name(request, name) + if tensor is None: + if name == "temperature": + print(f"Tensor for {name} is None!") + return None + return tensor.as_numpy() + + +def get_input_scalar_by_name(request, name): + tensor = get_input_tensor_by_name(request, name) + if tensor is None: + if name == 
"temperature": + print(f"Scalar for {name} is None!") + return None + if tensor.size != 1: + raise pb_utils.TritonModelException(f"Expected a single value for {name}") + return tensor.item() + + +def read_parameter_as_type(value, name, pytype=str): + if value == "": + return None + if value.startswith("${") and value.endswith("}"): + return None + if pytype is bool: + return value.lower() in ["1", "true"] + try: + result = pytype(value) + return result + except: + pb_utils.Logger.log_warning( + f"Could not read parameter '{name}' with value '{value}', will use default." + ) + return None + + +def get_parameter(model_config, name, pytype=str): + if name not in model_config["parameters"]: + return None + return read_parameter_as_type( + model_config["parameters"][name]["string_value"], name, pytype + ) + + +def convert_word_list(word_list): + if word_list is None: + return None + word_list = word_list.tolist() + if len(word_list) == 0 or len(word_list[0]) != 2: + raise pb_utils.TritonModelException(f"Invalid format for word list.") + words, indices = word_list[0] + result = [] + current_index = 0 + for i in indices: + if i == -1: + continue + if i > len(words): + raise pb_utils.TritonModelException(f"Invalid format for word list.") + current_word = [] + while current_index < i: + current_word.append(words[current_index]) + current_index += 1 + result.append(current_word) + return result + + +def parse_medusa_choices(medusa_choices): + if medusa_choices is None: + return None + try: + result = json.loads( + "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]" + ) + assert isinstance(result, list) and len(result) > 0 + assert all([isinstance(x, list) for x in result]) + assert all([isinstance(y, int) for x in result for y in x]) + except Exception: + raise pb_utils.TritonModelException("Invalid format for medusa_choices") + return result + + +def get_sampling_config_from_request(request): + kwargs = {} + kwargs["beam_width"] = get_input_scalar_by_name(request, "beam_width") or 1 + kwargs["top_k"] = get_input_scalar_by_name(request, "runtime_top_k") + kwargs["top_p"] = get_input_scalar_by_name(request, "runtime_top_p") + kwargs["top_p"] = ( + None if kwargs["top_p"] is None or kwargs["top_p"] <= 0 else kwargs["top_p"] + ) + kwargs["random_seed"] = get_input_scalar_by_name(request, "random_seed") + kwargs["temperature"] = get_input_scalar_by_name(request, "temperature") + # print(f"=========== [DEBUG] [trtllm python runtime model.py] {kwargs['temperature']=} ==========") + kwargs["min_length"] = get_input_scalar_by_name(request, "min_length") + kwargs["repetition_penalty"] = get_input_scalar_by_name( + request, "repetition_penalty" + ) + kwargs["presence_penalty"] = get_input_scalar_by_name(request, "presence_penalty") + kwargs["frequency_penalty"] = get_input_scalar_by_name(request, "frequency_penalty") + kwargs["length_penalty"] = get_input_scalar_by_name(request, "len_penalty") + kwargs["top_p_min"] = get_input_scalar_by_name(request, "runtime_top_p_min") + kwargs["top_p_reset_ids"] = get_input_scalar_by_name( + request, "runtime_top_p_reset_ids" + ) + kwargs["top_p_decay"] = get_input_scalar_by_name(request, "runtime_top_p_decay") + kwargs["beam_search_diversity_rate"] = get_input_scalar_by_name( + request, "beam_search_diversity_rate" + ) + kwargs["early_stopping"] = get_input_scalar_by_name(request, "early_stopping") + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.SamplingConfig(**kwargs) + + +def get_output_config_from_request(request, 
exclude_input_from_output): + kwargs = {} + kwargs["return_log_probs"] = get_input_scalar_by_name(request, "return_log_probs") + kwargs["return_context_logits"] = get_input_scalar_by_name( + request, "return_context_logits" + ) + kwargs["return_generation_logits"] = get_input_scalar_by_name( + request, "return_generation_logits" + ) + kwargs["exclude_input_from_output"] = exclude_input_from_output + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.OutputConfig(**kwargs) + + +def get_external_draft_tokens_config_from_request(request): + kwargs = {} + draft_input_ids = get_input_tensor_by_name(request, "draft_input_ids") + if draft_input_ids is not None: + kwargs["tokens"] = draft_input_ids.tolist() + draft_logits = get_input_tensor_by_name(request, "draft_logits") + if draft_logits is not None: + kwargs["logits"] = from_numpy(draft_logits) + kwargs["acceptance_threshold"] = get_input_scalar_by_name( + request, "draft_acceptance_threshold" + ) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.ExternalDraftTokensConfig(**kwargs) + return None + + +def get_prompt_tuning_config_from_request(request): + # prompt_vocab_size is unused by executor. + kwargs = {} + prompt_embedding_table = get_input_tensor_by_name(request, "prompt_embedding_table") + if prompt_embedding_table is not None: + kwargs["embedding_table"] = from_numpy(prompt_embedding_table) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.PromptTuningConfig(**kwargs) + return None + + +def get_lora_config_from_request(request): + kwargs = {} + kwargs["task_id"] = get_input_scalar_by_name(request, "lora_task_id") + lora_weights = get_input_tensor_by_name(request, "lora_weights") + if lora_weights is not None: + kwargs["weights"] = from_numpy(lora_weights) + lora_config = get_input_tensor_by_name(request, "lora_config") + if lora_config is not None: + kwargs["config"] = from_numpy(lora_config) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if len(kwargs) > 0: + return trtllm.LoraConfig(**kwargs) + return None + + +def convert_request(request, exclude_input_from_output, decoupled): + inputs = {} + input_token_ids = get_input_tensor_by_name(request, "input_ids") + if input_token_ids is None: + raise pb_utils.TritonModelException("A value is required for input_ids") + input_token_ids = input_token_ids.tolist() + if len(input_token_ids) == 0: + raise pb_utils.TritonModelException(f"Invalid format for input_ids") + inputs["input_token_ids"] = input_token_ids[0] + # input_lengths is not not used by executor. + inputs["max_new_tokens"] = get_input_scalar_by_name(request, "request_output_len") + if inputs["max_new_tokens"] is None: + raise pb_utils.TritonModelException( + "A value is required for request_output_len" + ) + inputs["streaming"] = get_input_scalar_by_name(request, "streaming") + if inputs["streaming"] and not decoupled: + raise pb_utils.TritonModelException( + "Streaming is only supported in decoupled mode." 
+ ) + inputs["end_id"] = get_input_scalar_by_name(request, "end_id") + inputs["pad_id"] = get_input_scalar_by_name(request, "pad_id") + inputs["stop_words"] = convert_word_list( + get_input_tensor_by_name(request, "stop_words_list") + ) + inputs["bad_words"] = convert_word_list( + get_input_tensor_by_name(request, "bad_words_list") + ) + embedding_bias = get_input_tensor_by_name(request, "embedding_bias") + if embedding_bias is not None and embedding_bias.size != 0: + inputs["embedding_bias"] = from_numpy(embedding_bias).squeeze() + + sampling_config = get_sampling_config_from_request(request) + output_config = get_output_config_from_request(request, exclude_input_from_output) + external_draft_tokens_config = get_external_draft_tokens_config_from_request( + request + ) + prompt_tuning_config = get_prompt_tuning_config_from_request(request) + lora_config = get_lora_config_from_request(request) + + return trtllm.Request( + **inputs, + sampling_config=sampling_config, + output_config=output_config, + external_draft_tokens_config=external_draft_tokens_config, + prompt_tuning_config=prompt_tuning_config, + lora_config=lora_config, + ) + + +def convert_response(response): + if response.has_error(): + return ( + pb_utils.InferenceResponse( + output_tensors=[], error=pb_utils.TritonError(response.error_msg) + ), + True, + ) + result = response.result + beam_lengths = np.expand_dims( + np.array([len(beam) for beam in result.output_token_ids], np.int32), 0 + ) + max_beam_length = max([len(beam) for beam in result.output_token_ids]) + output_ids = np.full( + (1, len(result.output_token_ids), max_beam_length), -1, np.int32 + ) + for idx, beam in enumerate(result.output_token_ids): + output_ids[0, idx, : len(beam)] = beam + output_tensors = [ + pb_utils.Tensor("output_ids", output_ids), + pb_utils.Tensor("sequence_length", beam_lengths), + ] + output_tensors.append( + pb_utils.Tensor( + "cum_log_probs", + np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) + if result.cum_log_probs is not None + else np.zeros((1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "output_log_probs", + np.expand_dims(np.array(result.log_probs, np.float32), 0) + if result.log_probs is not None + else np.zeros((1, 1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "context_logits", + np.expand_dims(np.array(result.context_logits, np.float32), 0) + if result.context_logits is not None + else np.zeros((1, 1, 1), np.float32), + ) + ) + output_tensors.append( + pb_utils.Tensor( + "generation_logits", + np.expand_dims(np.array(result.generation_logits, np.float32), 0) + if result.generation_logits is not None + else np.zeros((1, 1, 1, 1), np.float32), + ) + ) + return pb_utils.InferenceResponse(output_tensors), result.is_final + + +def convert_scheduler_policy(batch_scheduler_policy: str): + if batch_scheduler_policy.lower() == "max_utilization": + return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION + elif batch_scheduler_policy.lower() == "guaranteed_no_evict": + return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT + raise pb_utils.TritonModelException( + f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." 
+ ) + + +def convert_batching_type(gpt_model_type: str): + if gpt_model_type is None: + return None + if ( + gpt_model_type.lower() == "inflight_fused_batching" + or gpt_model_type.lower() == "inflight_batching" + ): + return trtllm.BatchingType.INFLIGHT + elif gpt_model_type.lower() == "v1": + return trtllm.BatchingType.STATIC + raise pb_utils.TritonModelException( + f"gpt_model_type value of '{gpt_model_type}' is not supported." + ) + + +def convert_decoding_mode(decoding_mode: str): + if decoding_mode is None: + return None + elif decoding_mode == "auto": + return trtllm.DecodingMode.Auto() + elif decoding_mode == "top_k": + return trtllm.DecodingMode.TopK() + elif decoding_mode == "top_p": + return trtllm.DecodingMode.TopP() + elif decoding_mode == "top_k_top_p": + return trtllm.DecodingMode.TopKTopP() + elif decoding_mode == "beam_search": + return trtllm.DecodingMode.BeamSearch() + elif decoding_mode == "medusa": + return trtllm.DecodingMode.Medusa() + raise pb_utils.TritonModelException( + f"decoding_mode value of '{decoding_mode}' is not supported." + ) + + +def convert_timestamp_to_seconds(timestamp: str): + return int(datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp()) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def get_scheduler_config(self, model_config): + batch_scheduler_policy = get_parameter(model_config, "batch_scheduler_policy") + if batch_scheduler_policy is None: + return trtllm.SchedulerConfig() + return trtllm.SchedulerConfig(convert_scheduler_policy(batch_scheduler_policy)) + + def get_kv_cache_config(self, model_config): + kwargs = { + "enable_block_reuse": get_parameter( + model_config, "enable_kv_cache_reuse", bool + ), + "max_tokens": get_parameter( + model_config, "max_tokens_in_paged_kv_cache", int + ), + "sink_token_length": get_parameter(model_config, "sink_token_length", int), + "max_attention_window": get_parameter( + model_config, "max_attention_window_size", int + ), + "free_gpu_memory_fraction": get_parameter( + model_config, "kv_cache_free_gpu_mem_fraction", float + ), + "host_cache_size": get_parameter( + model_config, "kv_cache_host_memory_bytes", int + ), + "onboard_blocks": get_parameter( + model_config, "kv_cache_onboard_blocks", bool + ), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.KvCacheConfig(**kwargs) + + def get_parallel_config(self, model_config): + kwargs = {} + gpu_device_ids = get_parameter(model_config, "gpu_device_ids") + if gpu_device_ids: + kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] + self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", "0") == "1" + if self.use_orchestrator_mode: + kwargs["communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR + worker_path = get_parameter(model_config, "worker_path") + if worker_path is not None: + raise pb_utils.TritonModelException( + "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable." 
+ ) + executor_worker_path = get_parameter(model_config, "executor_worker_path") + kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( + True, executor_worker_path + ) + if len(kwargs) > 0: + return trtllm.ParallelConfig(**kwargs) + return None + + def get_peft_cache_config(self, model_config): + kwargs = { + "optimal_adapter_size": get_parameter( + model_config, "lora_cache_optimal_adapter_size", int + ), + "max_adapter_size": get_parameter( + model_config, "lora_cache_max_adapter_size", int + ), + "device_cache_percent": get_parameter( + model_config, "lora_cache_gpu_memory_fraction", float + ), + "host_cache_size": get_parameter( + model_config, "lora_cache_host_memory_bytes", int + ), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.PeftCacheConfig(**kwargs) + + def get_decoding_config(self, model_config): + kwargs = { + "medusa_choices": parse_medusa_choices( + get_parameter(model_config, "medusa_choices") + ), + "decoding_mode": convert_decoding_mode( + get_parameter(model_config, "decoding_mode") + ), + } + print(kwargs) + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.DecodingConfig(**kwargs) + + def get_executor_config(self, model_config): + kwargs = { + "max_beam_width": get_parameter(model_config, "max_beam_width", int), + "scheduler_config": self.get_scheduler_config(model_config), + "kv_cache_config": self.get_kv_cache_config(model_config), + "enable_chunked_context": get_parameter( + model_config, "enable_chunked_context", bool + ), + "normalize_log_probs": get_parameter( + model_config, "normalize_log_probs", bool + ), + "batching_type": convert_batching_type( + get_parameter(model_config, "gpt_model_type") + ), + "parallel_config": self.get_parallel_config(model_config), + "peft_cache_config": self.get_peft_cache_config(model_config), + "decoding_config": self.get_decoding_config(model_config), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + return trtllm.ExecutorConfig(**kwargs) + + def create_metrics(self, model: str, version: str, is_v1_model: bool): + self.request_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_request_metrics", + description="TRT LLM request metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.runtime_memory_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_runtime_memory_metrics", + description="TRT LLM runtime memory metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.kv_cache_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_kv_cache_block_metrics", + description="TRT LLM KV cache block metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + model_type = "v1" if is_v1_model else "inflight_batcher" + self.model_type_metric_family = pb_utils.MetricFamily( + name=f"nv_trt_llm_{model_type}_metrics", + description=f"TRT LLM {model_type}-specific metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + self.general_metric_family = pb_utils.MetricFamily( + name="nv_trt_llm_general_metrics", + description="General TRT LLM metrics", + kind=pb_utils.MetricFamily.GAUGE, + ) + common_labels = {"model": model, "version": version} + self.all_metrics = { + # Request metrics + "num_active_requests": self.request_metric_family.Metric( + labels={"request_type": "active", **common_labels} + ), + "max_num_active_requests": self.request_metric_family.Metric( + labels={"request_type": "max", **common_labels} + ), + "num_scheduled_requests": self.request_metric_family.Metric( + labels={"request_type": "scheduled", **common_labels} + ), + 
"num_context_requests": self.request_metric_family.Metric( + labels={"request_type": "context", **common_labels} + ), + # Runtime metrics + "cpu_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "cpu", **common_labels} + ), + "gpu_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "gpu", **common_labels} + ), + "pinned_mem_usage": self.runtime_memory_metric_family.Metric( + labels={"memory_type": "pinned", **common_labels} + ), + # KV cache metrics + "max_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "max", **common_labels} + ), + "free_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "free", **common_labels} + ), + "used_num_blocks": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "used", **common_labels} + ), + "tokens_per_block": self.kv_cache_metric_family.Metric( + labels={"kv_cache_block_type": "tokens_per", **common_labels} + ), + # General metrics + "timestamp": self.general_metric_family.Metric( + labels={"general_type": "timestamp", **common_labels} + ), + "iter": self.general_metric_family.Metric( + labels={"general_type": "iteration_counter", **common_labels} + ), + } + if is_v1_model: + self.all_metrics.update( + { + "num_ctx_tokens": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "total_context_tokens", + **common_labels, + } + ), + "num_gen_tokens": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "total_generation_tokens", + **common_labels, + } + ), + "empty_gen_slots": self.model_type_metric_family.Metric( + labels={ + "v1_specific_metric": "empty_generation_slots", + **common_labels, + } + ), + } + ) + else: + self.all_metrics.update( + { + "num_ctx_tokens": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "total_context_tokens", + **common_labels, + } + ), + "num_gen_requests": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "generation_requests", + **common_labels, + } + ), + "micro_batch_id": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "micro_batch_id", + **common_labels, + } + ), + "num_paused_requests": self.model_type_metric_family.Metric( + labels={ + "inflight_batcher_specific_metric": "paused_requests", + **common_labels, + } + ), + } + ) + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + model_config = json.loads(args["model_config"]) + gpt_model_path = get_parameter(model_config, "gpt_model_path") + if get_parameter(model_config, "enable_trt_overlap", bool): + raise pb_utils.TritonModelException( + f"enable_trt_overlap=true is not supported." 
+ ) + self.exclude_input_from_output = get_parameter( + model_config, "exclude_input_in_output", bool + ) + executor_config = self.get_executor_config(model_config) + self.executor = trtllm.Executor( + gpt_model_path, trtllm.ModelType.DECODER_ONLY, executor_config + ) + self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) + self.cancellation_check_period_ms = ( + get_parameter(model_config, "cancellation_check_period_ms", int) or 100 + ) + self.stats_check_period_ms = ( + get_parameter(model_config, "stats_check_period_ms", int) or 100 + ) + + if not self.decoupled: + raise pb_utils.TritonModelException( + "Please enable decoupled transaction policy in the model configuration to serve this model" + ) + + self.create_metrics( + args["model_name"], + args["model_version"], + is_v1_model=executor_config.batching_type == trtllm.BatchingType.STATIC, + ) + self.triton_id_to_req_id = {} + self.req_id_to_response_sender = {} + self.lock = Lock() + self.running = False + self.awaiter_thread = Thread(target=self.awaiter_loop) + self.cancellation_thread = Thread(target=self.cancellation_loop) + self.metrics_thread = Thread(target=self.metrics_loop) + if self.executor.can_enqueue_requests(): + self.running = True + self.awaiter_thread.start() + self.cancellation_thread.start() + self.metrics_thread.start() + else: + # In leader mode, worker ranks will wait here until leader is done. + self.executor.shutdown() + + def handle_stop_request(self, triton_id, response_sender): + if triton_id is None or triton_id == "": + response_sender.send( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + "A request id must be provided for request cancellation" + ) + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + return + + if triton_id in self.triton_id_to_req_id: + req_id = self.triton_id_to_req_id[triton_id] + self.executor.cancel_request(req_id) + + response_sender.send( + pb_utils.InferenceResponse(), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + if not self.executor.can_enqueue_requests(): + return + + # Convert to executor requests. 
+ triton_requests = [] + executor_requests = [] + for request in requests: + response_sender = request.get_response_sender() + if get_input_scalar_by_name(request, "stop"): + self.handle_stop_request(request.request_id(), response_sender) + else: + try: + converted = convert_request( + request, self.exclude_input_from_output, self.decoupled + ) + except Exception as e: + response_sender.send( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" + ) + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + else: + triton_requests.append(request) + executor_requests.append(converted) + + with self.lock: + request_ids = self.executor.enqueue_requests(executor_requests) + for req_id, request in zip(request_ids, triton_requests): + triton_id = request.request_id() + self.req_id_to_response_sender[req_id] = ( + triton_id, + request.get_response_sender(), + ) + self.triton_id_to_req_id[triton_id] = req_id + return None + + def awaiter_loop(self): + """Gets responses from executor and returns the results.""" + while self.running: + for response in self.executor.await_responses( + timeout=datetime.timedelta(milliseconds=1) + ): + req_id = response.request_id + with self.lock: + if req_id not in self.req_id_to_response_sender: + continue + triton_id, response_sender = self.req_id_to_response_sender[req_id] + + triton_response, is_final = convert_response(response) + response_sender.send( + triton_response, + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + if is_final + else 0, + ) + + if is_final: + with self.lock: + del self.triton_id_to_req_id[triton_id] + del self.req_id_to_response_sender[req_id] + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def cancellation_loop(self): + """Checks if any pending requests have been cancelled.""" + while self.running: + time.sleep(self.cancellation_check_period_ms / 1000.0) + with self.lock: + for req_id, ( + triton_id, + response_sender, + ) in self.req_id_to_response_sender.items(): + if response_sender.is_cancelled(): + self.executor.cancel_request(req_id) + # Remove local reference so response_sender can be cleaned properly. + del response_sender + + def metrics_loop(self): + """Updates triton metrics using stats from the executor.""" + while self.running: + time.sleep(self.stats_check_period_ms / 1000.0) + for stat in self.executor.get_latest_iteration_stats(): + try: + for key, metric in self.all_metrics.items(): + value = None + if hasattr(stat, key): + value = getattr(stat, key) + elif stat.kv_cache_stats is not None and hasattr( + stat.kv_cache_stats, key + ): + value = getattr(stat.kv_cache_stats, key) + elif stat.static_batching_stats is not None and hasattr( + stat.static_batching_stats, key + ): + value = getattr(stat.static_batching_stats, key) + elif stat.inflight_batching_stats is not None and hasattr( + stat.inflight_batching_stats, key + ): + value = getattr(stat.inflight_batching_stats, key) + if value is not None: + if key == "timestamp": + value = convert_timestamp_to_seconds(value) + metric.set(value) + else: + pb_utils.Logger.log_warn(f'Metric "{key}" not found.') + except Exception as e: + pb_utils.Logger.log_warn(f"Error while processing metrics: {e}") + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. 
This function allows + the model to perform any necessary clean ups before exit. + """ + if self.executor.can_enqueue_requests(): + self.running = False + self.awaiter_thread.join() + self.cancellation_thread.join() + self.metrics_thread.join() + self.executor.shutdown() diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt new file mode 100644 index 0000000000..7c9f294b89 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt @@ -0,0 +1,542 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +name: "tensorrt_llm" +backend: "tensorrtllm" +#backend: "python" +max_batch_size: 256 + +model_transaction_policy { + decoupled: True +} + +dynamic_batching { + preferred_batch_size: [ 256 ] + max_queue_delay_microseconds: 1000 +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + allow_ragged_batch: true + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "draft_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, + { + name: "draft_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "bad_words_list" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "embedding_bias" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: 
"return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # the unique task ID for the given LoRA. + # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. + # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. + # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. + { + name: "lora_task_id" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] + # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer + # each of the in / out tensors are first flattened and then concatenated together in the format above. + # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. + { + name: "lora_weights" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true + }, + # module identifier (same size a first dimension of lora_weights) + # See LoraModule::ModuleType for model id mapping + # + # "attn_qkv": 0 # compbined qkv adapter + # "attn_q": 1 # q adapter + # "attn_k": 2 # k adapter + # "attn_v": 3 # v adapter + # "attn_dense": 4 # adapter for the dense layer in attention + # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection + # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection + # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate + # + # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] + { + name: "lora_config" + data_type: TYPE_INT32 + dims: [ -1, 3 ] + optional: true + allow_ragged_batch: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + }, + { + name: "sequence_length" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "${max_beam_width}" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "inflight_fused_batching" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "/tmp/engines/llama-3-8b-instruct" + } +} +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + 
value: { + string_value: "${max_tokens_in_paged_kv_cache}" + } +} +parameters: { + key: "max_attention_window_size" + value: { + string_value: "${max_attention_window_size}" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "${batch_scheduler_policy}" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: "${kv_cache_free_gpu_mem_fraction}" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" + } +} +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } +parameters: { + key: "exclude_input_in_output" + value: { + string_value: "True" + } +} +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} +parameters: { + key: "enable_kv_cache_reuse" + value: { + string_value: "${enable_kv_cache_reuse}" + } +} +parameters: { + key: "normalize_log_probs" + value: { + string_value: "${normalize_log_probs}" + } +} +parameters: { + key: "enable_chunked_context" + value: { + string_value: "${enable_chunked_context}" + } +} +parameters: { + key: "gpu_device_ids" + value: { + string_value: "${gpu_device_ids}" + } +} +parameters: { + key: "lora_cache_optimal_adapter_size" + value: { + string_value: "${lora_cache_optimal_adapter_size}" + } +} +parameters: { + key: "lora_cache_max_adapter_size" + value: { + string_value: "${lora_cache_max_adapter_size}" + } +} +parameters: { + key: "lora_cache_gpu_memory_fraction" + value: { + string_value: "${lora_cache_gpu_memory_fraction}" + } +} +parameters: { + key: "lora_cache_host_memory_bytes" + value: { + string_value: "${lora_cache_host_memory_bytes}" + } +} +parameters: { + key: "decoding_mode" + value: { + string_value: "${decoding_mode}" + } +} +parameters: { + key: "executor_worker_path" + value: { + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" + } +} +parameters: { + key: "medusa_choices" + value: { + string_value: "${medusa_choices}" + } +} +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py new file mode 100644 index 0000000000..c621cc14b4 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py @@ -0,0 +1,347 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections.abc import Generator +from dataclasses import dataclass +from typing import Optional + +import numpy as np + + +class RequestValidationError(Exception): + pass + + +def _validate_that(condition: bool, msg: str): + if not condition: + raise RequestValidationError(msg) + + +def _validate_non_empty(data, msg: str): + _validate_that(data is not None and data.size > 0, msg) + + +def _validate_single_gt_0(data, msg: str): + _validate_non_empty(data, msg) + _validate_that(data.flatten()[0] > 0, msg) + + +def _single_value(data: Optional[np.ndarray]): + if data is None: + return None + return data.flatten()[0] + + +@dataclass +class Request: + text_input: np.ndarray = np.array([]) + decoder_text_input: np.ndarray = None + max_tokens: np.ndarray = np.array([]) + bad_words: Optional[np.ndarray] = None + stop_words: Optional[np.ndarray] = None + end_id: Optional[np.ndarray] = None + pad_id: Optional[np.ndarray] = None + top_k: Optional[np.ndarray] = None + top_p: Optional[np.ndarray] = None + temperature: Optional[np.ndarray] = None + length_penalty: Optional[np.ndarray] = None + repetition_penalty: Optional[np.ndarray] = None + min_length: Optional[np.ndarray] = None + return_log_probs: Optional[np.ndarray] = None + prompt_embedding_table: Optional[np.ndarray] = None + prompt_vocab_size: Optional[np.ndarray] = None + embedding_bias_words: Optional[np.ndarray] = None + embedding_bias_weights: Optional[np.ndarray] = None + num_draft_tokens: Optional[np.ndarray] = None + use_draft_logits: Optional[np.ndarray] = None + stream: Optional[np.ndarray] = None + beam_width: Optional[np.ndarray] = None + return_context_logits: Optional[np.ndarray] = None + return_generation_logits: Optional[np.ndarray] = None + random_seed: Optional[np.ndarray] = None + presence_penalty: Optional[np.ndarray] = None + frequency_penalty: Optional[np.ndarray] = None + + def validate(self): + _validate_non_empty(self.text_input, "text_input is required") + _validate_single_gt_0(self.max_tokens, "max_tokens must be a single value > 0") + + num_draft_tokens = _single_value(self.num_draft_tokens) + stream = _single_value(self.stream) + _single_value(self.return_generation_logits) + context_logits = _single_value(self.return_context_logits) + + if num_draft_tokens: + _validate_that( + not stream, "streaming is not supported with speculative decoding" 
+ ) + _validate_that( + not context_logits, + "context logits are not supported with speculative decoding", + ) + + +@dataclass +class DraftRequest: + draft_input_ids: Optional[np.ndarray] = None + draft_logits: Optional[np.ndarray] = None + + +@dataclass +class PreprocResponse: + input_ids: np.ndarray = np.array([]) + decoder_input_ids: np.ndarray = None + input_lengths: np.ndarray = np.array([]) + decoder_input_lengths: np.ndarray = None + bad_words_list: Optional[np.ndarray] = None + stop_words_list: Optional[np.ndarray] = None + embedding_bias: Optional[np.ndarray] = None + end_id: Optional[np.ndarray] = None + pad_id: Optional[np.ndarray] = None + + @classmethod + def with_new_inputs( + cls, + other, + input_ids: Optional[np.ndarray] = None, + input_lengths: Optional[np.ndarray] = None, + ): + return cls( + input_ids=(input_ids if input_ids is not None else other.input_ids), + input_lengths=( + input_lengths if input_lengths is not None else other.input_lengths + ), + decoder_input_ids=other.decoder_input_ids, + decoder_input_lengths=other.decoder_input_lengths, + bad_words_list=other.bad_words_list, + stop_words_list=other.stop_words_list, + end_id=other.end_id, + pad_id=other.pad_id, + ) + + +@dataclass +class GenerationResponse: + output_ids: np.ndarray = np.array([]) + sequence_length: np.ndarray = np.array([]) + cum_log_probs: Optional[np.ndarray] = None + output_log_probs: Optional[np.ndarray] = None + context_logits: Optional[np.ndarray] = None + generation_logits: Optional[np.ndarray] = None + + +@dataclass +class Response: + text_output: np.ndarray = np.array([]) + cum_log_probs: Optional[np.ndarray] = None + output_log_probs: Optional[np.ndarray] = None + context_logits: Optional[np.ndarray] = None + generation_logits: Optional[np.ndarray] = None + + def __eq__(self, o) -> bool: + """Just for testing""" + if not isinstance(o, Response): + return False + return ( + np.array_equal(self.text_output, o.text_output) + and np.array_equal(self.cum_log_probs, o.cum_log_probs) + and np.array_equal(self.output_log_probs, o.output_log_probs) + and np.array_equal(self.context_logits, o.context_logits) + and np.array_equal(self.generation_logits, o.generation_logits) + ) + + +class Decoder: + def __init__(self, streaming=False, accumulate=False): + self._streaming = streaming + self._accumulate = accumulate + + self._accumulated_tokens = None + + def decode( + self, request: Request, speculative_decoding=False + ) -> Generator[Response, None, None]: + preproc_response = self.preprocess(request) + + # print(f"[DEBUG] Decoder.decode {request.temperature=}") + if speculative_decoding: + for gen_response in self._spec_generate(preproc_response, request): + yield self.postprocess(gen_response) + else: + if not self._streaming: + gen_response = self._generate_non_streaming(preproc_response, request) + yield self.postprocess(gen_response) + else: + for gen_response in self._generate(preproc_response, request): + yield self.postprocess(gen_response) + + def encountered_stop_words(self, input_ids, stop_words_ids): + for stop_word_ids in stop_words_ids: + if np.array_equal(input_ids[-len(stop_word_ids) :], stop_word_ids): + return True + return False + + def _spec_generate( + self, preproc: PreprocResponse, request: Request + ) -> Generator[GenerationResponse, None, None]: + prompt_input_ids: np.ndarray = preproc.input_ids[0] + input_ids: np.ndarray = prompt_input_ids + output_len: int = request.max_tokens[0][0] + last_input_ids: np.ndarray = None + draft_output_ids: np.ndarray = None + 
draft_logits: np.ndarray = None + + target_response: GenerationResponse = None + + cur_preproc = preproc + + counter = 0 + while True: + counter += 1 + num_draft_tokens = min( + request.num_draft_tokens[0][0], + len(prompt_input_ids) + output_len - len(input_ids) - 1, + ) + + draft_request = None + if num_draft_tokens > 0: + draft_response: GenerationResponse = self._draft_generate_non_streaming( + cur_preproc, request, num_draft_tokens + ) + seq_len: int = draft_response.sequence_length[0][0] + # [1, beamWidth, outputLength] -> [outputLen] + draft_output_ids = draft_response.output_ids[0][0] + # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] + if request.use_draft_logits is not None and request.use_draft_logits[0]: + if draft_response.generation_logits is not None: + draft_logits = draft_response.generation_logits[0][0] + + input_draft_tokens = draft_output_ids[len(input_ids) : seq_len] + draft_request = DraftRequest( + draft_input_ids=np.expand_dims(input_draft_tokens, 0) + ) + if request.use_draft_logits is not None and request.use_draft_logits[0]: + draft_request.draft_logits = np.expand_dims( + draft_logits[-len(input_draft_tokens) :], 0 + ) + else: + draft_request = DraftRequest() + target_response = self._generate_non_streaming( + cur_preproc, request, draft_request + ) + last_input_ids = input_ids + input_ids = target_response.output_ids[0][0] + cur_preproc = PreprocResponse.with_new_inputs( + cur_preproc, + np.expand_dims(input_ids, 0), + np.array([[len(input_ids)]], dtype=np.int32), + ) + + # Evaluate criteria to stop generation loop. + # If we've hit or exceeded the max output length, should stop + length_stop = len(input_ids) >= len(prompt_input_ids) + output_len + if length_stop: + break + # If draft and target have same outputs, should stop. Normally target should return 1 more token. 
+ # If they are the same length, they should differ at the last token + target_draft_equal = draft_output_ids is not None and np.array_equal( + draft_output_ids, input_ids + ) + if target_draft_equal: + break + # If tokens no longer change, should stop, means we have hit early stopping + last_current_equal = np.array_equal(last_input_ids, input_ids) + if last_current_equal: + break + # Need to check if stop words was encountered + hit_stop_words = self.encountered_stop_words( + input_ids, preproc.stop_words_list[0] + ) + if hit_stop_words: + break + + yield target_response + + def _draft_generate_non_streaming( + self, preproc: PreprocResponse, request: Request, num_draft_tokens: int + ) -> GenerationResponse: + raise NotImplementedError() + + def _generate( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> Generator[GenerationResponse, None, None]: + raise NotImplementedError() + + def _generate_non_streaming( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> GenerationResponse: + raise NotImplementedError() + + def postprocess(self, gen_response: GenerationResponse) -> Response: + if self._accumulate and self._streaming: + new_tokens: np.ndarray = gen_response.output_ids + if new_tokens.ndim != 3: + raise Exception("Expected output_ids tensor to have 3 dims.") + if new_tokens.shape[0] != 1: + raise Exception("Expected batch size of 1") + if new_tokens.shape[1] != 1: + raise Exception( + "Accumulation of tokens is only implemented for beam width = 1" + ) + + self._accumulated_tokens = ( + new_tokens + if (self._accumulated_tokens is None) + else np.concatenate((self._accumulated_tokens, new_tokens), axis=2) + ) + sequence_lengths = np.array( + [[self._accumulated_tokens.shape[2]]], dtype=np.int32 + ) + return self._postprocess( + self._accumulated_tokens, sequence_lengths, gen_response + ) + else: + return self._postprocess(gen_response.output_ids, None, gen_response) + + def _postprocess( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ) -> Response: + raise NotImplementedError() + + def preprocess(self, request: Request) -> PreprocResponse: + raise NotImplementedError() + + def reset_decoder(self): + self._accumulated_tokens = None diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py new file mode 100644 index 0000000000..62c06f4836 --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -0,0 +1,478 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from collections.abc import Callable +from typing import Dict, Optional + +import numpy as np +import triton_python_backend_utils as pb_utils +from lib.decode import * +from typing_extensions import override + + +class TritonDecoder(Decoder): + def __init__( + self, + streaming=False, + accumulate=False, + preproc_model_name="preprocessing", + postproc_model_name="postprocessing", + llm_model_name="tensorrt_llm", + draft_llm_model_name: Optional[str] = None, + ): + super().__init__(streaming=streaming, accumulate=accumulate) + self.preproc_model_name = preproc_model_name + self.postproc_model_name = postproc_model_name + self.llm_model_name = llm_model_name + self.draft_llm_model_name = draft_llm_model_name + + self._preproc_outputs = [ + "INPUT_ID", + "DECODER_INPUT_ID", + "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + "EMBEDDING_BIAS", + "OUT_PAD_ID", + "OUT_END_ID", + ] + + self._llm_outputs = [ + "output_ids", + "sequence_length", + "cum_log_probs", + "output_log_probs", + "context_logits", + "generation_logits", + ] + + self._postproc_outputs = [ + "OUTPUT", + ] + + self.input_names = [ + "text_input", + "decoder_text_input", + "max_tokens", + "bad_words", + "stop_words", + "end_id", + "pad_id", + "top_k", + "top_p", + "temperature", + "length_penalty", + "repetition_penalty", + "min_length", + "presence_penalty", + "frequency_penalty", + "random_seed", + "return_log_probs", + "return_context_logits", + "return_generation_logits", + "beam_width", + "stream", + "prompt_embedding_table", + "prompt_vocab_size", + "embedding_bias_words", + "embedding_bias_weights", + "num_draft_tokens", + "use_draft_logits", + ] + + self.__undo_reshape_whitelist = { + "max_tokens", + "end_id", + "pad_id", + "top_k", + "top_p", + "temperature", + "length_penalty", + "repetition_penalty", + "min_length", + "presence_penalty", + "frequency_penalty", + "random_seed", + "return_log_probs", + "return_context_logits", + "return_generation_logits", + "beam_width", + "stream", + "prompt_vocab_size", + "num_draft_tokens", + "use_draft_logits", + } + + def _exec_triton_request(self, request): + responses = request.exec(decoupled=True) + for r in responses: + if r.has_error(): + raise pb_utils.TritonModelException(r.error().message()) + yield r + + def _exec_triton_request_single(self, request): + responses = request.exec(decoupled=False) + if responses.has_error(): + raise pb_utils.TritonModelException(responses.error().message()) + return responses + + def create_triton_response(self, response: Response): + name_map = { + "text_output": "text_output", + "cum_log_probs": "cum_log_probs", + "output_log_probs": "output_log_probs", + "context_logits": "context_logits", + "generation_logits": 
"generation_logits", + } + tensors = self.create_triton_tensors(response, name_map) + return pb_utils.InferenceResponse(output_tensors=tensors) + + def convert_triton_request(self, triton_request) -> Request: + request = Request() + for triton_name in self.input_names: + tensor = pb_utils.get_input_tensor_by_name(triton_request, triton_name) + target_name = triton_name + if tensor is None: + continue + if not hasattr(request, target_name): + raise AttributeError(f"Request has no attribute '{target_name}'") + setattr(request, target_name, tensor.as_numpy()) + return request + + def convert_triton_response( + self, triton_response, response_factory: Callable, name_map=None + ): + response = response_factory() + for tensor in triton_response.output_tensors(): + if tensor is None: + continue + triton_name = tensor.name() + value = tensor.as_numpy() + target_name = triton_name + if name_map and triton_name in name_map: + target_name = name_map[triton_name] + if name_map and not triton_name in name_map: + continue + if target_name is None: + # explicitly ignore this triton input + continue + if not hasattr(response, target_name): + raise AttributeError( + f"response object has not attribute '{target_name}'" + ) + setattr(response, target_name, value) + return response + + def __undo_reshape(self, x, name): + if name in self.__undo_reshape_whitelist and len(x.shape) == 1: + # handle reshapes + return np.expand_dims(x, 0) + else: + return x + + def create_triton_tensors(self, obj, name_map: dict): + tensors = [] + for name, triton_name in name_map.items(): + if triton_name is None: + continue + value = getattr(obj, name) + if value is None: + continue + t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) + tensors.append(t) + return tensors + + @override + def preprocess(self, request: Request) -> PreprocResponse: + input_tensors = self._get_preproc_tensors(request) + triton_req = pb_utils.InferenceRequest( + model_name=self.preproc_model_name, + inputs=input_tensors, + requested_output_names=self._preproc_outputs, + ) + triton_output = self._exec_triton_request_single(triton_req) + return self._get_preproc_response(triton_output) + + def _get_preproc_tensors(self, request: Request): + name_map = { + "text_input": "QUERY", + "decoder_text_input": "DECODER_QUERY", + "max_tokens": "REQUEST_OUTPUT_LEN", + "bad_words": "BAD_WORDS_DICT", + "stop_words": "STOP_WORDS_DICT", + "embedding_bias_words": "EMBEDDING_BIAS_WORDS", + "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", + "pad_id": "PAD_ID", + "end_id": "END_ID", + } + return self.create_triton_tensors(request, name_map) + + def _get_preproc_response(self, triton_output): + name_map = { + "INPUT_ID": "input_ids", + "DECODER_INPUT_ID": "decoder_input_ids", + "REQUEST_INPUT_LEN": "input_lengths", + "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", + "BAD_WORDS_IDS": "bad_words_list", + "STOP_WORDS_IDS": "stop_words_list", + "EMBEDDING_BIAS": "embedding_bias", + "OUT_PAD_ID": "pad_id", + "OUT_END_ID": "end_id", + } + return self.convert_triton_response(triton_output, PreprocResponse, name_map) + + @override + def _draft_generate_non_streaming( + self, preproc: PreprocResponse, request: Request, num_draft_tokens: int + ) -> GenerationResponse: + input_tensors = self._get_llm_tensors( + preproc, request, num_draft_tokens, None, True + ) + triton_req = pb_utils.InferenceRequest( + model_name=self.draft_llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + triton_response = 
self._exec_triton_request_single(triton_req) + llm_response = self._get_llm_response(triton_response) + return llm_response + + @override + def _generate( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> Generator[GenerationResponse, None, None]: + input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) + triton_req = pb_utils.InferenceRequest( + model_name=self.llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + for r in self._exec_triton_request(triton_req): + yield self._get_llm_response(r) + + @override + def _generate_non_streaming( + self, + preproc: PreprocResponse, + request: Request, + draft_request: Optional[DraftRequest] = None, + ) -> GenerationResponse: + input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) + triton_req = pb_utils.InferenceRequest( + model_name=self.llm_model_name, + inputs=input_tensors, + requested_output_names=self._llm_outputs, + ) + r = self._exec_triton_request_single(triton_req) + return self._get_llm_response(r) + + def _get_llm_tensors( + self, + preproc: PreprocResponse, + request: Request, + num_output_tokens: Optional[int] = None, + draft_request: Optional[DraftRequest] = None, + is_draft_model_request: bool = False, + ): + tensors = [] + # print(f"[get_llm_tensors] {request.temperature=}") + tensors.extend(self._get_tensors_from_preproc(preproc)) + tensors.extend( + self._get_llm_tensors_from_request( + request, num_output_tokens, draft_request, is_draft_model_request + ) + ) + return tensors + + def _get_tensors_from_preproc(self, preproc: PreprocResponse): + name_map = { + "input_ids": "input_ids", + "decoder_input_ids": "decoder_input_ids", + "input_lengths": "input_lengths", + "bad_words_list": "bad_words_list", + "stop_words_list": "stop_words_list", + "embedding_bias": "embedding_bias", + "pad_id": "pad_id", + "end_id": "end_id", + } + return self.create_triton_tensors(preproc, name_map) + + def _get_llm_tensors_from_request( + self, + request: Request, + num_output_tokens: Optional[int] = None, + draft_request: Optional[DraftRequest] = None, + is_draft_model_request: bool = False, + ): + name_map: Dict[str, Optional[str]] = { + "beam_width": "beam_width", + "top_k": "runtime_top_k", + "top_p": "runtime_top_p", + # "temperature": "temperature", + "length_penalty": "len_penalty", + "repetition_penalty": "repetition_penalty", + "min_length": "min_length", + "presence_penalty": "presence_penalty", + "frequency_penalty": "frequency_penalty", + "random_seed": "random_seed", + "return_log_probs": "return_log_probs", + "stream": "streaming", + "prompt_embedding_table": "prompt_embedding_table", + "prompt_vocab_size": "prompt_vocab_size", + } + # print(f"[get_llm_tensors_from_request] {request.temperature=}") + temp_found = "temperature" in name_map + # print(f"[get_llm_tensors_from_request] temperature in name_map = {temp_found}") + tensors = self.create_triton_tensors(request, name_map) + + out_len = request.max_tokens[0][0] if request.max_tokens else None + if num_output_tokens is not None: + out_len = num_output_tokens + elif draft_request: + if draft_request.draft_input_ids is not None: + out_len = len(draft_request.draft_input_ids[0]) + 1 + else: + out_len = 1 + + if out_len is None: + raise Exception("Could not determine request_output_len") + else: + tensors.append( + pb_utils.Tensor( + "request_output_len", np.array([[out_len]], dtype=np.int32) + ) + ) + + if draft_request: + if 
draft_request.draft_input_ids is not None: + tensors.append( + pb_utils.Tensor("draft_input_ids", draft_request.draft_input_ids) + ) + if ( + draft_request.draft_logits is not None + and request.use_draft_logits is not None + and request.use_draft_logits[0] + ): + tensors.append( + pb_utils.Tensor("draft_logits", draft_request.draft_logits) + ) + + return_context_logits = False + return_generation_logits = False + if draft_request is None: + if is_draft_model_request: + return_generation_logits = ( + request.use_draft_logits[0] + if request.use_draft_logits is not None + else False + ) + else: + return_context_logits = ( + request.return_context_logits[0] + if request.return_context_logits is not None + else False + ) + return_generation_logits = ( + request.return_generation_logits[0] + if request.return_generation_logits is not None + else False + ) + + tensors.append( + pb_utils.Tensor( + "return_context_logits", np.array([[return_context_logits]]) + ) + ) + tensors.append( + pb_utils.Tensor( + "return_generation_logits", np.array([[return_generation_logits]]) + ) + ) + return tensors + + def _get_llm_response(self, triton_output): + name_map = { + "output_ids": "output_ids", + "sequence_length": "sequence_length", + "cum_log_probs": "cum_log_probs", + "output_log_probs": "output_log_probs", + "context_logits": "context_logits", + "generation_logits": "generation_logits", + } + return self.convert_triton_response(triton_output, GenerationResponse, name_map) + + def _postprocess( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ) -> Response: + input_tensors = self._get_postproc_tensors( + tokens, sequence_lengths, gen_response + ) + triton_req = pb_utils.InferenceRequest( + model_name=self.postproc_model_name, + inputs=input_tensors, + requested_output_names=self._postproc_outputs, + ) + r = self._exec_triton_request_single(triton_req) + response = self._get_response(r, gen_response) + return response + + def _get_postproc_tensors( + self, + tokens: np.ndarray, + sequence_lengths: Optional[np.ndarray], + gen_response: GenerationResponse, + ): + tensors = [ + pb_utils.Tensor("TOKENS_BATCH", tokens), + pb_utils.Tensor( + "SEQUENCE_LENGTH", + sequence_lengths if sequence_lengths else gen_response.sequence_length, + ), + ] + return tensors + + def _get_response(self, triton_output, gen_res: GenerationResponse): + tensors = triton_output.output_tensors() + t_map = {} + for named_t in tensors: + name = named_t.name() + t = named_t.as_numpy() + t_map[name] = t + response = Response( + text_output=t_map["OUTPUT"], + cum_log_probs=gen_res.cum_log_probs, + output_log_probs=gen_res.output_log_probs, + context_logits=gen_res.context_logits, + generation_logits=gen_res.generation_logits, + ) + return response diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py new file mode 100644 index 0000000000..0a5d54546d --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py @@ -0,0 +1,137 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import traceback + +import triton_python_backend_utils as pb_utils +from lib.triton_decoder import TritonDecoder + + +class TritonPythonModel: + def initialize(self, args): + # Parse model configs + model_config = json.loads(args["model_config"]) + + params = model_config["parameters"] + + accumulate_tokens_str = "" + if "accumulate_tokens" in params: + accumulate_tokens_str = params["accumulate_tokens"]["string_value"] + + self.accumulate_tokens = accumulate_tokens_str.lower() in [ + "true", + "yes", + "1", + "t", + ] + + self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) + + self.logger = pb_utils.Logger + + self.llm_model_name = "tensorrt_llm" + if "tensorrt_llm_model_name" in params: + self.llm_model_name = params["tensorrt_llm_model_name"]["string_value"] + self.draft_llm_model_name = None + if "tensorrt_llm_draft_model_name" in params: + self.draft_llm_model_name = params["tensorrt_llm_draft_model_name"][ + "string_value" + ] + + self.decoder = TritonDecoder( + streaming=self.decoupled, + accumulate=self.accumulate_tokens, + preproc_model_name="preprocessing", + postproc_model_name="postprocessing", + llm_model_name=self.llm_model_name, + draft_llm_model_name=self.draft_llm_model_name, + ) + + def execute(self, requests): + responses = [] + + for request in requests: + if self.decoupled: + response_sender = request.get_response_sender() + try: + req = self.decoder.convert_triton_request(request) + req.validate() + # print(f"[DEBUG] ========= [bls model.py] {req.temperature=} ===========") + speculative_decode = ( + req.num_draft_tokens is not None and req.num_draft_tokens[0][0] > 0 + ) + if speculative_decode and ( + self.draft_llm_model_name is None or self.draft_llm_model_name == "" + ): + raise Exception( + "cannot perform speculative decoding without draft model" + ) + res_gen = self.decoder.decode( + req, speculative_decoding=speculative_decode + ) + + for res in res_gen: + triton_response = self.decoder.create_triton_response(res) + if self.decoupled: + response_sender.send(triton_response) + else: + responses.append(triton_response) + + if self.decoupled: + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + + except Exception: + self.logger.log_error(traceback.format_exc()) + # If encountering an error, send a response with err msg + 
error_response = pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(traceback.format_exc()), + ) + + if self.decoupled: + response_sender.send(error_response) + response_sender.send( + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + else: + responses.append(error_response) + + self.decoder.reset_decoder() + if self.decoupled: + return None + else: + assert len(responses) == len(requests) + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt new file mode 100644 index 0000000000..aa3b26336c --- /dev/null +++ b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt @@ -0,0 +1,252 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
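+#
+# Config for the "tensorrt_llm_bls" Python model implemented in 1/model.py
+# above. The parameters at the bottom of this file are read in initialize():
+# "accumulate_tokens" (left here as a ${...} placeholder) controls whether
+# streamed tokens are accumulated before postprocessing,
+# "tensorrt_llm_model_name" selects the engine model that requests are
+# forwarded to, and an empty "tensorrt_llm_draft_model_name" means
+# speculative decoding is unavailable.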
+ +backend: "python" +max_batch_size: 256 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "max_tokens" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "end_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "frequency_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "return_log_probs" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_context_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "return_generation_logits" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "prompt_embedding_table" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + }, + { + name: "prompt_vocab_size" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true + }, + { + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true + }, + { + name: "num_draft_tokens", + data_type: TYPE_INT32, + dims: [ 1 ] + optional: true + }, + { + name: "use_draft_logits", + data_type: TYPE_BOOL, + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "cum_log_probs" + data_type: TYPE_FP32 + dims: [ -1 ] + }, + { + name: "output_log_probs" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "context_logits" + data_type: TYPE_FP32 + dims: [ -1, -1 ] + }, + { + name: "generation_logits" + data_type: TYPE_FP32 + dims: [ -1, -1, -1 ] + } +] + +parameters: { + key: "accumulate_tokens" + value: { + string_value: "${accumulate_tokens}" + } +} +parameters: { + key: "tensorrt_llm_model_name" + value: { + string_value: "tensorrt_llm" + } +} +parameters: { + key: "tensorrt_llm_draft_model_name" + value: { + string_value: "" + } +} + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/python/openai/openai/tests/test_chat_completions.py b/python/openai/openai/tests/test_chat_completions.py new file mode 100644 index 0000000000..7e5548252a --- /dev/null +++ 
b/python/openai/openai/tests/test_chat_completions.py @@ -0,0 +1,447 @@ +import copy +from pathlib import Path +from typing import List + +import pytest +from fastapi.testclient import TestClient +from tests.utils import setup_fastapi_app + + +class TestChatCompletions: + @pytest.fixture(scope="class") + def client(self, fastapi_client_class_scope): + yield fastapi_client_class_scope + + def test_chat_completions_defaults(self, client, model: str, messages: List[dict]): + response = client.post( + "/v1/chat/completions", + json={"model": model, "messages": messages}, + ) + + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + # "usage" currently not supported + assert response.json()["usage"] == None + + def test_chat_completions_system_prompt(self, client, model: str): + # NOTE: Currently just sanity check that there are no issues when a + # system role is provided. There is no test logic to measure the quality + # of the response yet. + messages = [ + {"role": "system", "content": "You are a Triton Inference Server expert."}, + {"role": "user", "content": "What is machine learning?"}, + ] + + response = client.post( + "/v1/chat/completions", json={"model": model, "messages": messages} + ) + + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + def test_chat_completions_system_prompt_only(self, client, model: str): + # No user prompt provided + messages = [ + {"role": "system", "content": "You are a Triton Inference Server expert."} + ] + + response = client.post( + "/v1/chat/completions", json={"model": model, "messages": messages} + ) + + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + @pytest.mark.parametrize( + "param_key, param_value", + [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + # logprobs is a boolean for chat completions + ("logprobs", True), + ("logit_bias", {"0": 0}), + ], + ) + def test_chat_completions_sampling_parameters( + self, client, param_key, param_value, model: str, messages: List[dict] + ): + response = client.post( + "/v1/chat/completions", + json={ + "model": model, + "messages": messages, + param_key: param_value, + }, + ) + + # TODO: Add support and remove this check + unsupported_parameters = ["logprobs", "logit_bias"] + if param_key in unsupported_parameters: + assert response.status_code == 400 + assert response.json()["detail"] == "logit bias and log probs not supported" + return + + assert response.status_code == 200 + assert response.json()["choices"][0]["message"]["content"] + assert response.json()["choices"][0]["message"]["role"] == "assistant" + + @pytest.mark.parametrize( + "param_key, param_value", + [ + ("temperature", 2.1), + ("temperature", -0.1), + ("max_tokens", -1), + ("top_p", 1.1), + ("frequency_penalty", 3), + ("frequency_penalty", -3), + ("presence_penalty", 2.1), + ("presence_penalty", -2.1), + ], + ) + def test_chat_completions_invalid_sampling_parameters( + self, client, param_key, param_value, model: str, messages: List[dict] + ): + response = client.post( + "/v1/chat/completions", + json={ + "model": model, + "messages": messages, + param_key: param_value, + }, + ) + + print("Response:", response.json()) + 
assert response.status_code == 422 + + # Simple tests to verify max_tokens roughly behaves as expected + def test_chat_completions_max_tokens( + self, client, model: str, messages: List[dict] + ): + responses = [] + payload = {"model": model, "messages": messages, "max_tokens": 1} + + # Send two requests with max_tokens = 1 to check their similarity + payload["max_tokens"] = 1 + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + # Send one requests with larger max_tokens to check its dis-similarity + payload["max_tokens"] = 100 + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + # Simplification: One token shouldn't be more than one space-delimited word + assert len(response1_text) == len(response2_text) == 1 + assert len(response3_text) > len(response1_text) + + @pytest.mark.parametrize( + "temperature", + [0.0, 1.0], + ) + # Simple tests to verify temperature roughly behaves as expected + def test_chat_completions_temperature_vllm( + self, client, temperature, backend: str, model: str, messages: List[dict] + ): + if backend != "vllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + + responses = [] + payload = { + "model": model, + "messages": messages, + "max_tokens": 256, + "temperature": temperature, + } + + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + + # Temperature of 0.0 indicates greedy sampling, so check + # that two equivalent requests produce the same response. + if temperature == 0.0: + # NOTE: This check may be ambitious to get an exact match in all + # cases depending on how other parameter defaults are set, so + # it can probably be removed if it introduces flakiness. + assert response1_text == response2_text + # Temperature of 1.0 indicates maximum randomness, so check + # that two equivalent requests produce different responses. 
+ elif temperature == 1.0: + assert response1_text != response2_text + # Don't bother checking values other than the extremes + else: + raise ValueError(f"Unexpected {temperature=} for this test.") + + # Remove xfail when fix is released and this test returns xpass status + @pytest.mark.xfail( + reason="TRT-LLM BLS model will ignore temperature until a later release" + ) + # Simple tests to verify temperature roughly behaves as expected + def test_chat_completions_temperature_tensorrtllm( + self, client, backend: str, model: str, messages: List[dict] + ): + if backend != "tensorrtllm": + pytest.skip( + reason="Only used to test TRT-LLM-specific temperature behavior" + ) + + responses = [] + payload1 = { + "model": model, + "messages": messages, + # Increase token length to allow more room for variability + "max_tokens": 200, + "temperature": 0.0, + # TRT-LLM requires certain settings of `top_k` / `top_p` to + # respect changes in `temperature` + "top_p": 0.5, + } + + payload2 = copy.deepcopy(payload1) + payload2["temperature"] = 1.0 + + # First 2 responses should be the same in TRT-LLM with identical payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + + assert response1_text == response2_text + assert response1_text != response3_text + + # Simple tests to verify random seed roughly behaves as expected + def test_chat_completions_seed(self, client, model: str, messages: List[dict]): + responses = [] + payload1 = { + "model": model, + "messages": messages, + # Increase token length to allow more room for variability + "max_tokens": 200, + "seed": 1, + } + payload2 = copy.deepcopy(payload1) + payload2["seed"] = 2 + + # First 2 responses should be the same in both vLLM and TRT-LLM with identical seed + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/chat/completions", + json=payload1, + ) + ) + # Third response should differ with different seed in payload + responses.append( + client.post( + "/v1/chat/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = ( + responses[0].json()["choices"][0]["message"]["content"].strip().split() + ) + response2_text = ( + responses[1].json()["choices"][0]["message"]["content"].strip().split() + ) + response3_text = ( + responses[2].json()["choices"][0]["message"]["content"].strip().split() + ) + + assert response1_text == response2_text + assert response1_text != response3_text + + def test_chat_completions_no_message( + self, client, model: str, messages: List[dict] + ): + # Message validation requires min_length of 1 + messages = [] + response = client.post( + "/v1/chat/completions", json={"model": model, "messages": messages} + ) + assert response.status_code == 422 + assert ( + 
response.json()["detail"][0]["msg"] + == "List should have at least 1 item after validation, not 0" + ) + + def test_chat_completions_empty_message( + self, client, model: str, messages: List[dict] + ): + # Message validation requires min_length of 1 + messages = [{}] + response = client.post( + "/v1/chat/completions", json={"model": model, "messages": messages} + ) + assert response.status_code == 422 + assert response.json()["detail"][0]["msg"] == "Field required" + + def test_chat_completions_multiple_choices( + self, client, model: str, messages: List[dict] + ): + response = client.post( + "/v1/chat/completions", + json={"model": model, "messages": messages, "n": 2}, + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "Only single choice is supported" + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_chat_completions_streaming(self, client): + pass + + def test_chat_completions_no_streaming( + self, client, model: str, messages: List[dict] + ): + response = client.post( + "/v1/chat/completions", + json={"model": model, "messages": messages, "stream": False}, + ) + + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_function_calling(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_lora(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_multi_lora(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_n_choices(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_logprobs(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_request_logit_bias(self): + pass + + # TODO: Do we want to support "usage" field for token counts in response? + @pytest.mark.skip(reason="Not Implemented Yet") + def test_usage_response(self): + pass + + +# For tests that won't use the same pytest fixture for server startup across +# the whole class test suite. +class TestChatCompletionsCustomFixture: + # A TOKENIZER must be known for /chat/completions endpoint in order to + # apply chat templates, and for simplicity in determination, users should + # define the TOKENIZER. So, explicitly raise an error if none is provided. 
+ def test_chat_completions_no_tokenizer( + self, backend: str, model: str, messages: List[dict] + ): + model_repository = str(Path(__file__).parent / f"{backend}_models") + app = setup_fastapi_app(model_repository=model_repository, tokenizer="") + with TestClient(app) as client: + response = client.post( + "/v1/chat/completions", + json={"model": model, "messages": messages}, + ) + assert response.status_code == 400 + assert response.json()["detail"] == "Unknown tokenizer" diff --git a/python/openai/openai/tests/test_completions.py b/python/openai/openai/tests/test_completions.py new file mode 100644 index 0000000000..e43e225988 --- /dev/null +++ b/python/openai/openai/tests/test_completions.py @@ -0,0 +1,321 @@ +import copy + +import pytest + + +class TestCompletions: + @pytest.fixture(scope="class") + def client(self, fastapi_client_class_scope): + yield fastapi_client_class_scope + + def test_completions_defaults(self, client, model: str, prompt: str): + response = client.post( + "/v1/completions", + json={"model": model, "prompt": prompt}, + ) + + print("Response:", response.json()) + assert response.status_code == 200 + # NOTE: Could be improved to look for certain quality of response, + # or tested with dummy identity model. + assert response.json()["choices"][0]["text"].strip() + # "usage" currently not supported + assert response.json()["usage"] == None + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 0.7), + ("max_tokens", 10), + ("top_p", 0.9), + ("frequency_penalty", 0.5), + ("presence_penalty", 0.2), + # logprobs is an integer for completions + ("logprobs", 5), + ("logit_bias", {"0": 0}), + ], + ) + def test_completions_sampling_parameters( + self, client, sampling_parameter, value, model: str, prompt: str + ): + response = client.post( + "/v1/completions", + json={ + "model": model, + "prompt": prompt, + sampling_parameter: value, + }, + ) + print("Response:", response.json()) + + # TODO: Add support and remove this check + unsupported_parameters = ["logprobs", "logit_bias"] + if sampling_parameter in unsupported_parameters: + assert response.status_code == 400 + assert response.json()["detail"] == "logit bias and log probs not supported" + return + + assert response.status_code == 200 + assert response.json()["choices"][0]["text"].strip() + + # Simple tests to verify max_tokens roughly behaves as expected + def test_completions_max_tokens(self, client, model: str, prompt: str): + responses = [] + payload = {"model": model, "prompt": prompt, "max_tokens": 1} + + # Send two requests with max_tokens = 1 to check their similarity + payload["max_tokens"] = 1 + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + # Send one requests with larger max_tokens to check its dis-similarity + payload["max_tokens"] = 100 + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + # Simplification: One token shouldn't be more than one space-delimited word + assert len(response1_text) == len(response2_text) == 1 + assert len(response3_text) > len(response1_text) + + @pytest.mark.parametrize( + 
"temperature", + [0.0, 1.0], + ) + # Simple tests to verify temperature roughly behaves as expected + def test_completions_temperature_vllm( + self, client, temperature, backend: str, model: str, prompt: str + ): + if backend != "vllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + + responses = [] + payload = { + "model": model, + "prompt": prompt, + "temperature": temperature, + } + + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + + # Temperature of 0.0 indicates greedy sampling, so check + # that two equivalent requests produce the same response. + if temperature == 0.0: + # NOTE: This check may be ambitious to get an exact match in all + # frameworks depending on how other parameter defaults are set, so + # it can probably be removed if it introduces flakiness. + print(f"Comparing '{response1_text}' == '{response2_text}'") + assert response1_text == response2_text + # Temperature of 1.0 indicates maximum randomness, so check + # that two equivalent requests produce different responses. + elif temperature == 1.0: + print(f"Comparing '{response1_text}' != '{response2_text}'") + assert response1_text != response2_text + # Don't bother checking values other than the extremes + else: + raise ValueError(f"Unexpected {temperature=} for this test.") + + # Remove xfail when fix is released and this test returns xpass status + @pytest.mark.xfail( + reason="TRT-LLM BLS model will ignore temperature until a later release" + ) + # Simple tests to verify temperature roughly behaves as expected + def test_completions_temperature_tensorrtllm( + self, client, backend: str, model: str, prompt: str + ): + if backend != "tensorrtllm": + pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + + responses = [] + payload1 = { + "model": model, + "prompt": prompt, + "temperature": 0.0, + # TRT-LLM requires certain settings of `top_k` / `top_p` to + # respect changes in `temperature` + "top_p": 0.5, + } + payload2 = copy.deepcopy(payload1) + payload2["temperature"] = 1.0 + + # First 2 responses should be the same in TRT-LLM with identical payload + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + + assert response1_text == response2_text + assert response1_text != response3_text + + # Simple tests to verify seed roughly behaves as expected + def test_completions_seed(self, client, model: str, prompt: str): + responses = [] + payload1 = {"model": model, "prompt": prompt, "seed": 1} + payload2 = copy.deepcopy(payload1) + payload2["seed"] = 2 + + # First 2 responses should be the same in TRT-LLM with 
identical payload + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + responses.append( + client.post( + "/v1/completions", + json=payload1, + ) + ) + # Third response should differ with different temperature in payload + responses.append( + client.post( + "/v1/completions", + json=payload2, + ) + ) + + for response in responses: + print("Response:", response.json()) + assert response.status_code == 200 + + response1_text = responses[0].json()["choices"][0]["text"].strip().split() + response2_text = responses[1].json()["choices"][0]["text"].strip().split() + response3_text = responses[2].json()["choices"][0]["text"].strip().split() + + assert response1_text == response2_text + assert response1_text != response3_text + + @pytest.mark.parametrize( + "sampling_parameter, value", + [ + ("temperature", 2.1), + ("temperature", -0.1), + ("max_tokens", -1), + ("top_p", 1.1), + ("frequency_penalty", 3), + ("frequency_penalty", -3), + ("presence_penalty", 2.1), + ("presence_penalty", -2.1), + ], + ) + def test_completions_invalid_sampling_parameters( + self, client, sampling_parameter, value, model: str, prompt: str + ): + response = client.post( + "/v1/completions", + json={ + "model": model, + "prompt": prompt, + sampling_parameter: value, + }, + ) + + print("Response:", response.json()) + assert response.status_code == 422 + + def test_completions_empty_request(self, client): + response = client.post("/v1/completions", json={}) + assert response.status_code == 422 + + def test_completions_no_model(self, client, prompt: str): + response = client.post("/v1/completions", json={"prompt": prompt}) + assert response.status_code == 422 + + def test_completions_no_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model}) + assert response.status_code == 422 + + def test_completions_empty_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model, "prompt": ""}) + + # NOTE: Should this be validated in schema instead? + # 400 Error returned in route handler + assert response.status_code == 400 + + def test_no_prompt(self, client, model: str): + response = client.post("/v1/completions", json={"model": model}) + + # 422 Error returned by schema validation + assert response.status_code == 422 + + def test_completions_multiple_choices(self, client, model: str, prompt: str): + response = client.post( + "/v1/completions", json={"model": model, "prompt": prompt, "n": 2} + ) + + assert response.status_code == 400 + assert response.json()["detail"] == "Only single choice is supported" + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_lora(self): + pass + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_multi_lora(self): + pass + + # TODO: Do we want to support "usage" field for token counts in response? + @pytest.mark.skip(reason="Not Implemented Yet") + def test_usage_response(self): + pass diff --git a/python/openai/openai/tests/test_models/mock_llm/1/model.py b/python/openai/openai/tests/test_models/mock_llm/1/model.py new file mode 100644 index 0000000000..1cf5f3613c --- /dev/null +++ b/python/openai/openai/tests/test_models/mock_llm/1/model.py @@ -0,0 +1,108 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+import time
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        self.model_config = json.loads(args["model_config"])
+        self.decoupled = self.model_config.get("model_transaction_policy", {}).get(
+            "decoupled"
+        )
+
+    def execute(self, requests):
+        if self.decoupled:
+            return self.exec_decoupled(requests)
+        else:
+            return self.exec(requests)
+
+    def exec(self, requests):
+        responses = []
+        for request in requests:
+            params = json.loads(request.parameters())
+            rep_count = params["REPETITION"] if "REPETITION" in params else 1
+
+            input_np = pb_utils.get_input_tensor_by_name(
+                request, "text_input"
+            ).as_numpy()
+            stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy()
+            stream = stream_np.flatten()[0]
+            if stream:
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        error=pb_utils.TritonError(
+                            "STREAM only supported in decoupled mode"
+                        )
+                    )
+                )
+            else:
+                out_tensor = pb_utils.Tensor(
+                    "text_output", np.repeat(input_np, rep_count, axis=1)
+                )
+                responses.append(pb_utils.InferenceResponse([out_tensor]))
+        return responses
+
+    def exec_decoupled(self, requests):
+        for request in requests:
+            params = json.loads(request.parameters())
+            rep_count = params["REPETITION"] if "REPETITION" in params else 1
+            fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False
+            delay = params["DELAY"] if "DELAY" in params else None
+
+            sender = request.get_response_sender()
+            input_np = pb_utils.get_input_tensor_by_name(
+                request, "text_input"
+            ).as_numpy()
+            stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy()
+            out_tensor = pb_utils.Tensor("text_output", input_np)
+            response = pb_utils.InferenceResponse([out_tensor])
+            # If stream enabled, just send multiple copies of response
+            # FIXME: Could split up response string into tokens, but this is simpler for now.
+ stream = stream_np.flatten()[0] + if stream: + for _ in range(rep_count): + if delay is not None: + time.sleep(delay) + sender.send(response) + sender.send( + None + if not fail_last + else pb_utils.InferenceResponse( + error=pb_utils.TritonError("An Error Occurred") + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + # If stream disabled, just send one response + else: + sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + return None diff --git a/python/openai/openai/tests/test_models/mock_llm/config.pbtxt b/python/openai/openai/tests/test_models/mock_llm/config.pbtxt new file mode 100644 index 0000000000..5f665ff543 --- /dev/null +++ b/python/openai/openai/tests/test_models/mock_llm/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
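+#
+# NOTE: Minimal config for the mock model used by these tests. It is marked
+# decoupled so the accompanying model.py can exercise both streaming
+# (multiple responses per request) and non-streaming paths.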
+backend: "python" + +max_batch_size: 0 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ 1, 1 ] + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1, 1 ] + } +] + +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ 1, -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/python/openai/openai/tests/test_observability.py b/python/openai/openai/tests/test_observability.py new file mode 100644 index 0000000000..eca88a03de --- /dev/null +++ b/python/openai/openai/tests/test_observability.py @@ -0,0 +1,71 @@ +import os +from pathlib import Path + +import pytest +from fastapi.testclient import TestClient +from src.api_server import init_app + + +# Override conftest.py default model +@pytest.fixture +def model(): + return "mock_llm" + + +class TestObservability: + @pytest.fixture(scope="class") + def client(self): + model_repository = Path(__file__).parent / "test_models" + os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) + app = init_app() + with TestClient(app) as test_client: + yield test_client + + ### General Error Handling ### + def test_not_found(self, client): + response = client.get("/does-not-exist") + assert response.status_code == 404 + + ### Startup / Health ### + def test_startup_success(self, client): + response = client.get("/health") + assert response.status_code == 200 + + def test_startup_fail(self): + os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" + with pytest.raises(Exception): + # Test that FastAPI lifespan startup fails when initializing Triton + # with unknown model repository. + app = init_app() + with TestClient(app): + pass + + ### Metrics ### + def test_startup_metrics(self, client): + response = client.get("/metrics") + assert response.status_code == 200 + # FIXME: Flesh out more + # NOTE: response.json() works even on non-json prometheus data + assert "nv_cpu_utilization" in response.json() + + ### Models ### + def test_models_list(self, client, model): + # TODO: Load multiple models and make sure exactly ALL are returned + response = client.get("/v1/models") + assert response.status_code == 200 + models = response.json()["data"] + assert len(models) == 1 + assert models[0]["id"] == model + assert models[0]["object"] == "model" + assert models[0]["created"] > 0 + assert models[0]["owned_by"] == "Triton Inference Server" + + def test_models_get(self, client, model): + # TODO: Load multiple models and make sure exactly 1 is returned + response = client.get(f"/v1/models/{model}") + assert response.status_code == 200 + model_resp = response.json() + assert model_resp["id"] == model + assert model_resp["object"] == "model" + assert model_resp["created"] > 0 + assert model_resp["owned_by"] == "Triton Inference Server" diff --git a/python/openai/openai/tests/test_openai_client.py b/python/openai/openai/tests/test_openai_client.py new file mode 100644 index 0000000000..6c61403e73 --- /dev/null +++ b/python/openai/openai/tests/test_openai_client.py @@ -0,0 +1,163 @@ +from typing import List + +import openai +import pytest + + +class TestOpenAIClient: + @pytest.fixture(scope="class") + def client(self, server): + return server.get_client() + + def test_openai_client_models(self, client: openai.OpenAI, backend: str): + models = list(client.models.list()) + print(f"Models: {models}") + if backend == "tensorrtllm": + # ensemble or tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 5 + elif 
backend == "vllm": + assert len(models) == 1 + else: + raise Exception(f"Unexpected backend {backend=}") + + def test_openai_client_completion( + self, client: openai.OpenAI, model: str, prompt: str + ): + completion = client.completions.create( + prompt=prompt, + model=model, + ) + + print(f"Completion results: {completion}") + assert completion.choices[0].text + assert completion.choices[0].finish_reason == "stop" + + def test_openai_client_chat_completion( + self, client: openai.OpenAI, model: str, messages: List[dict] + ): + chat_completion = client.chat.completions.create( + messages=messages, + model=model, + ) + + print(f"Chat completion results: {chat_completion}") + assert chat_completion.choices[0].message.content + assert chat_completion.choices[0].finish_reason == "stop" + + @pytest.mark.parametrize("echo", [False, True]) + def test_openai_client_completion_echo( + self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str + ): + if backend == "tensorrtllm": + pytest.skip( + reason="TRT-LLM backend currently only supports setting this parameter at model load time", + ) + + completion = client.completions.create(prompt=prompt, model=model, echo=echo) + + print(f"Completion results: {completion}") + response = completion.choices[0].text + if echo: + assert prompt in response + else: + assert prompt not in response + + @pytest.mark.skip(reason="Not Implemented Yet") + def test_openai_client_function_calling(self): + pass + + +class TestAsyncOpenAIClient: + @pytest.fixture(scope="class") + def client(self, server): + return server.get_async_client() + + @pytest.mark.asyncio + async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str): + async_models = await client.models.list() + models = [model async for model in async_models] + print(f"Models: {models}") + if backend == "tensorrtllm": + # ensemble or tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 5 + elif backend == "vllm": + assert len(models) == 1 + else: + raise Exception(f"Unexpected backend {backend=}") + + @pytest.mark.asyncio + async def test_openai_client_completion( + self, client: openai.AsyncOpenAI, model: str, prompt: str + ): + completion = await client.completions.create( + prompt=prompt, + model=model, + ) + + print(f"Completion results: {completion}") + assert completion.choices[0].text + assert completion.choices[0].finish_reason == "stop" + + @pytest.mark.asyncio + async def test_openai_client_chat_completion( + self, client: openai.AsyncOpenAI, model: str, messages: List[dict] + ): + chat_completion = await client.chat.completions.create( + messages=messages, + model=model, + ) + + assert chat_completion.choices[0].message.content + assert chat_completion.choices[0].finish_reason == "stop" + print(f"Chat completion results: {chat_completion}") + + # TODO: Add this test + @pytest.mark.skip(reason="Not Implemented Yet") + @pytest.mark.asyncio + async def test_completion_streaming(self): + pass + + @pytest.mark.asyncio + async def test_chat_streaming( + self, client: openai.AsyncOpenAI, model: str, messages: List[dict] + ): + # test single completion + chat_completion = await client.chat.completions.create( + model=model, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=False, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model, + messages=messages, + max_tokens=10, + 
temperature=0.0, + stream=True, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert "".join(chunks) == output + + @pytest.mark.skip(reason="Not Implemented Yet") + @pytest.mark.asyncio + async def test_openai_client_function_calling(self): + pass diff --git a/python/openai/openai/tests/utils.py b/python/openai/openai/tests/utils.py new file mode 100644 index 0000000000..d03368663a --- /dev/null +++ b/python/openai/openai/tests/utils.py @@ -0,0 +1,93 @@ +import os +import subprocess +import sys +import time +from pathlib import Path +from typing import Dict, List, Optional + +import openai +import requests +from src.api_server import init_app + + +def setup_fastapi_app(tokenizer: str, model_repository: str): + os.environ["TOKENIZER"] = tokenizer + os.environ["TRITON_MODEL_REPOSITORY"] = model_repository + app = init_app() + return app + + +# Heavily inspired by vLLM's test infrastructure +class OpenAIServer: + API_KEY = "EMPTY" # Triton's OpenAI server does not need API key + START_TIMEOUT = 120 # wait for server to start for up to 120 seconds + + def __init__( + self, + cli_args: List[str], + *, + env_dict: Optional[Dict[str, str]] = None, + ) -> None: + self.host = "localhost" + self.port = 8000 + + env = os.environ.copy() + if env_dict is not None: + env.update(env_dict) + + this_dir = Path(__file__).resolve().parent + script_path = this_dir / ".." / ".." / "main.py" + self.proc = subprocess.Popen( + ["python3", script_path] + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + ) + # Wait until health endpoint is responsive + self._wait_for_server(url=self.url_for("health"), timeout=self.START_TIMEOUT) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + try: + wait_secs = 30 + self.proc.wait(wait_secs) + except subprocess.TimeoutExpired: + # force kill if needed + self.proc.kill() + + def _wait_for_server(self, *, url: str, timeout: float): + start = time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as err: + result = self.proc.poll() + if result is not None and result != 0: + raise RuntimeError("Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError("Server failed to start in time.") from err + + @property + def url_root(self) -> str: + return f"http://{self.host}:{self.port}" + + def url_for(self, *parts: str) -> str: + return self.url_root + "/" + "/".join(parts) + + def get_client(self): + return openai.OpenAI( + base_url=self.url_for("v1"), + api_key=self.API_KEY, + ) + + def get_async_client(self): + return openai.AsyncOpenAI( + base_url=self.url_for("v1"), + api_key=self.API_KEY, + ) diff --git a/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json b/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json new file mode 100644 index 0000000000..00f18b88bd --- /dev/null +++ b/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json @@ -0,0 +1 @@ +{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ 
No newline at end of file diff --git a/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt b/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt new file mode 100644 index 0000000000..4ad6534943 --- /dev/null +++ b/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt @@ -0,0 +1,2 @@ +backend: "vllm" +instance_group [{kind: KIND_MODEL}] \ No newline at end of file From 363b40ea4faf259510ac642e69cc4a1812bf7884 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:06:32 -0700 Subject: [PATCH 27/80] Move openai code to server/python folder --- qa/L0_openai/CONTRIBUTING.md | 34 - qa/L0_openai/examples/chat.sh | 9 - qa/L0_openai/examples/genai_perf.sh | 12 - qa/L0_openai/examples/models.sh | 3 - qa/L0_openai/examples/openai_client.py | 28 - qa/L0_openai/examples/streaming_chat.sh | 19 - qa/L0_openai/openai/Dockerfile.trtllm | 4 - qa/L0_openai/openai/Dockerfile.vllm | 6 - qa/L0_openai/openai/README.md | 128 --- qa/L0_openai/openai/main.py | 67 -- qa/L0_openai/openai/src/__init__.py | 0 qa/L0_openai/openai/src/api_server.py | 74 -- qa/L0_openai/openai/src/routers/__init__.py | 0 .../openai/src/routers/chat_completions.py | 178 ---- .../openai/src/routers/completions.py | 125 --- qa/L0_openai/openai/src/routers/models.py | 59 -- .../openai/src/routers/observability.py | 24 - qa/L0_openai/openai/src/schemas/__init__.py | 0 qa/L0_openai/openai/src/schemas/openai.py | 871 ------------------ qa/L0_openai/openai/src/tests/__init__.py | 0 qa/L0_openai/openai/src/tests/conftest.py | 75 -- .../tests/tensorrtllm_models/ensemble/1/.tmp | 0 .../tensorrtllm_models/ensemble/config.pbtxt | 470 ---------- .../postprocessing/1/model.py | 246 ----- .../postprocessing/config.pbtxt | 113 --- .../preprocessing/1/model.py | 418 --------- .../preprocessing/config.pbtxt | 156 ---- .../tensorrt_llm/1/.gitkeep | 0 .../tensorrt_llm/1/model.py | 797 ---------------- .../tensorrt_llm/config.pbtxt | 542 ----------- .../tensorrt_llm_bls/1/lib/decode.py | 347 ------- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 478 ---------- .../tensorrt_llm_bls/1/model.py | 137 --- .../tensorrt_llm_bls/config.pbtxt | 252 ----- .../openai/src/tests/test_chat_completions.py | 447 --------- .../openai/src/tests/test_completions.py | 321 ------- .../src/tests/test_models/mock_llm/1/model.py | 108 --- .../tests/test_models/mock_llm/config.pbtxt | 60 -- .../openai/src/tests/test_observability.py | 71 -- .../openai/src/tests/test_openai_client.py | 163 ---- qa/L0_openai/openai/src/tests/utils.py | 93 -- .../llama-3.1-8b-instruct/1/model.json | 1 - .../llama-3.1-8b-instruct/config.pbtxt | 2 - qa/L0_openai/openai/src/utils/__init__.py | 0 qa/L0_openai/openai/src/utils/tokenizer.py | 77 -- qa/L0_openai/openai/src/utils/triton.py | 219 ----- 46 files changed, 7234 deletions(-) delete mode 100644 qa/L0_openai/CONTRIBUTING.md delete mode 100755 qa/L0_openai/examples/chat.sh delete mode 100755 qa/L0_openai/examples/genai_perf.sh delete mode 100755 qa/L0_openai/examples/models.sh delete mode 100755 qa/L0_openai/examples/openai_client.py delete mode 100755 qa/L0_openai/examples/streaming_chat.sh delete mode 100644 qa/L0_openai/openai/Dockerfile.trtllm delete mode 100644 qa/L0_openai/openai/Dockerfile.vllm delete mode 100644 qa/L0_openai/openai/README.md delete mode 100755 qa/L0_openai/openai/main.py delete mode 100644 qa/L0_openai/openai/src/__init__.py delete mode 100644 qa/L0_openai/openai/src/api_server.py delete mode 100644 qa/L0_openai/openai/src/routers/__init__.py 
delete mode 100644 qa/L0_openai/openai/src/routers/chat_completions.py delete mode 100644 qa/L0_openai/openai/src/routers/completions.py delete mode 100644 qa/L0_openai/openai/src/routers/models.py delete mode 100644 qa/L0_openai/openai/src/routers/observability.py delete mode 100644 qa/L0_openai/openai/src/schemas/__init__.py delete mode 100644 qa/L0_openai/openai/src/schemas/openai.py delete mode 100644 qa/L0_openai/openai/src/tests/__init__.py delete mode 100644 qa/L0_openai/openai/src/tests/conftest.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/1/.tmp delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py delete mode 100644 qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/test_chat_completions.py delete mode 100644 qa/L0_openai/openai/src/tests/test_completions.py delete mode 100644 qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py delete mode 100644 qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/tests/test_observability.py delete mode 100644 qa/L0_openai/openai/src/tests/test_openai_client.py delete mode 100644 qa/L0_openai/openai/src/tests/utils.py delete mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json delete mode 100644 qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt delete mode 100644 qa/L0_openai/openai/src/utils/__init__.py delete mode 100644 qa/L0_openai/openai/src/utils/tokenizer.py delete mode 100644 qa/L0_openai/openai/src/utils/triton.py diff --git a/qa/L0_openai/CONTRIBUTING.md b/qa/L0_openai/CONTRIBUTING.md deleted file mode 100644 index 8e758a7956..0000000000 --- a/qa/L0_openai/CONTRIBUTING.md +++ /dev/null @@ -1,34 +0,0 @@ -# Triton Inference Server OpenAI Example - -## Development - -For simplicity, a `Dockerfile` containing the necessary -dependencies is included, which can be modified and built -for your needs. - -``` -docker build -t fastapi_triton . -# TODO: minimal args -docker run ... fastapi_triton -# TODO: cd to location -fastapi dev -``` - -## Testing - -The testing for this example is all done through `pytest`, which -is well integrated with `FastAPI`. - -``` -cd src/tests -pytest -``` - -## Adding New Routes - -First define your own router in `src/routers`, referring -to the existing routers as examples. - -Then, add your router to the application in `api_server.py` -with `app.include_router(my_router)`. 
- diff --git a/qa/L0_openai/examples/chat.sh b/qa/L0_openai/examples/chat.sh deleted file mode 100755 index 5a7bb9b656..0000000000 --- a/qa/L0_openai/examples/chat.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# or "tensorrt_llm_bls" for TRT-LLM -MODEL=${1:-"llama-3.1-8b-instruct"} -curl -s http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL}'", - "messages": [{"role": "user", "content": "Say this is a test!"}] - }' | jq diff --git a/qa/L0_openai/examples/genai_perf.sh b/qa/L0_openai/examples/genai_perf.sh deleted file mode 100755 index b1e3716fb6..0000000000 --- a/qa/L0_openai/examples/genai_perf.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -MODEL=${1:-"llama-3.1-8b-instruct"} -genai-perf \ - --model ${MODEL} \ - --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ - --service-kind openai \ - --endpoint-type chat \ - --synthetic-input-tokens-mean 256 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 256 \ - --output-tokens-stddev 0 \ - --streaming diff --git a/qa/L0_openai/examples/models.sh b/qa/L0_openai/examples/models.sh deleted file mode 100755 index 944fbe07af..0000000000 --- a/qa/L0_openai/examples/models.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -curl -s http://localhost:8000/v1/models \ - -H "Content-Type: application/json" | jq diff --git a/qa/L0_openai/examples/openai_client.py b/qa/L0_openai/examples/openai_client.py deleted file mode 100755 index 913aac44a0..0000000000 --- a/qa/L0_openai/examples/openai_client.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -import sys - -from openai import OpenAI - -# or "tensorrt_llm_bls" for TRT-LLM -model = "llama-3.1-8b-instruct" -if len(sys.argv) > 1: - model = sys.argv[1] - -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="EMPTY", -) - -completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are a helpful assistant.", - }, - {"role": "user", "content": "What are LLMs?"}, - ], - max_tokens=256, -) - -print(completion.choices[0].message.content) diff --git a/qa/L0_openai/examples/streaming_chat.sh b/qa/L0_openai/examples/streaming_chat.sh deleted file mode 100755 index 6ace5434d2..0000000000 --- a/qa/L0_openai/examples/streaming_chat.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# or "tensorrt_llm_bls" for TRT-LLM -MODEL=${1:-"llama-3.1-8b-instruct"} -curl -s -N http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL}'", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Hello!" 
- } - ], - "stream": true - }' diff --git a/qa/L0_openai/openai/Dockerfile.trtllm b/qa/L0_openai/openai/Dockerfile.trtllm deleted file mode 100644 index 1128cc4355..0000000000 --- a/qa/L0_openai/openai/Dockerfile.trtllm +++ /dev/null @@ -1,4 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 -FROM ${BASE_IMAGE} -RUN pip install /opt/tritonserver/python/*.whl -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" diff --git a/qa/L0_openai/openai/Dockerfile.vllm b/qa/L0_openai/openai/Dockerfile.vllm deleted file mode 100644 index dbb8a5f63d..0000000000 --- a/qa/L0_openai/openai/Dockerfile.vllm +++ /dev/null @@ -1,6 +0,0 @@ -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 -FROM ${BASE_IMAGE} -RUN pip install /opt/tritonserver/python/*.whl -# NOTE: Newer vllm version upgrade to support Llama3.1 in 24.07 container. -# This should be unnecessary in 24.08 container. -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" diff --git a/qa/L0_openai/openai/README.md b/qa/L0_openai/openai/README.md deleted file mode 100644 index 8bef9fd1d4..0000000000 --- a/qa/L0_openai/openai/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# OpenAI-Compatible Frontend for Triton Inference Server - -## Pre-requisites - -1. Docker + NVIDIA Container Runtime -2. A correctly configured `HF_TOKEN` for access to HuggingFace models. - - The current examples and testing primarily use the - [`meta-llama/Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - model, but you can manually bring your own models and adjust accordingly. - -## VLLM - -1. Build and launch the container: -```bash -docker build -t tritonserver-openai-vllm -f Dockerfile.vllm . -# NOTE: The volume mount is flexible as long as you can access -# all the source files within the container. -docker run -it --net=host --gpus all --rm \ - -v ${PWD}:/workspace \ - -w /workspace \ - tritonserver-openai-vllm -``` - -2. Launch the OpenAI server: -```bash -# NOTE: Adjust the --tokenizer based on the model being used -python3 main.py --model-repository src/tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct -``` - -3. Send a `/chat/completions` request: -```bash -MODEL="llama-3.1-8b-instruct" -curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ - "model": "'${MODEL}'", - "messages": [{"role": "user", "content": "Say this is a test!"}] -}' -``` - -4. Send a `/completions` request: -```bash -MODEL="llama-3.1-8b-instruct" -curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' -d '{ - "model": "'${MODEL}'", - "prompt": "Machine learning is" -}' -``` - -5. Benchmark with `genai-perf`: -```bash -MODEL="llama-3.1-8b-instruct" -TOKENIZER="meta-llama/Meta-Llama-3-8B-Instruct" -genai-perf \ - --model ${MODEL} \ - --tokenizer ${TOKENIZER} \ - --service-kind openai \ - --endpoint-type chat \ - --synthetic-input-tokens-mean 256 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 256 \ - --output-tokens-stddev 0 \ - --streaming -``` - -6. 
Use an OpenAI client: -```python -from openai import OpenAI - -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="EMPTY", -) - -model = "llama-3.1-8b-instruct" -completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are a helpful assistant.", - }, - {"role": "user", "content": "What are LLMs?"}, - ], - max_tokens=256, -) - -print(completion.choices[0].message.content) -``` - -7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): -``` -cd src/tests -pytest -v -``` - -8. For other examples, see the `examples/` folder. - -## TensorRT-LLM - -0. `[TODO]` Prepare your model repository for a TensorRT-LLM model, build the engine, etc. - -1. Build and launch the container: -``` -docker build -t tritonserver-openai-tensorrtllm -f Dockerfile.tensorrtllm . -# NOTE: The volume mount is flexible as long as you can access -# all the source files within the container. -docker run -it --net=host --gpus all --rm \ - -v ${PWD}:/workspace \ - -w /workspace \ - tritonserver-openai-tensorrtllm -``` - -2. Launch the OpenAI server: -``` -# NOTE: Adjust the --tokenizer based on the model being used -python3 main.py --model-repository src/tests/tensorrt_llm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct -``` - -3. Send requests: -``` -MODEL="tensorrt_llm_bls" -curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ - "model": "'${MODEL}'", - "messages": [{"role": "user", "content": "Say this is a test!"}] -}' -``` - -The other examples should be the same as vLLM, except that you should set `MODEL="tensorrt_llm_bls"`, -everywhere applicable as seen in the example request above. diff --git a/qa/L0_openai/openai/main.py b/qa/L0_openai/openai/main.py deleted file mode 100755 index 4f6f11a9f9..0000000000 --- a/qa/L0_openai/openai/main.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os - -import uvicorn -from src.api_server import init_app - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Triton OpenAI Compatible RESTful API server." - ) - # Uvicorn - uvicorn_group = parser.add_argument_group("Uvicorn") - uvicorn_group.add_argument("--host", type=str, default=None, help="host name") - uvicorn_group.add_argument("--port", type=int, default=8000, help="port number") - uvicorn_group.add_argument( - "--uvicorn-log-level", - type=str, - default="info", - choices=["debug", "info", "warning", "error", "critical", "trace"], - help="log level for uvicorn", - ) - - # Triton - triton_group = parser.add_argument_group("Triton Inference Server") - triton_group.add_argument( - "--tritonserver-log-level", - type=int, - default=0, - help="The tritonserver log verbosity level", - ) - triton_group.add_argument( - "--model-repository", - type=str, - default=None, - help="Path to the Triton model repository holding the models to be served", - ) - triton_group.add_argument( - "--tokenizer", - type=str, - default=None, - help="HuggingFace ID of the Tokenizer to use for chat templates", - ) - - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - # NOTE: Think about other ways to pass triton args to fastapi app, - # but use env vars for simplicity for now. 
- if args.model_repository: - os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository - if args.tokenizer: - os.environ["TOKENIZER"] = args.tokenizer - - os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) - - app = init_app() - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=5, - ) diff --git a/qa/L0_openai/openai/src/__init__.py b/qa/L0_openai/openai/src/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/api_server.py b/qa/L0_openai/openai/src/api_server.py deleted file mode 100644 index 1b7543a4a0..0000000000 --- a/qa/L0_openai/openai/src/api_server.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import annotations - -from contextlib import asynccontextmanager - -import tritonserver -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from src.routers import chat_completions, completions, models, observability -from src.utils.triton import init_tritonserver - - -def add_cors_middleware(app: FastAPI): - # Allow API calls through browser /docs route for debug purposes - origins = [ - "http://localhost", - ] - - print(f"[WARNING] Adding CORS for the following origins: {origins}") - app.add_middleware( - CORSMiddleware, - allow_origins=origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - -@asynccontextmanager -async def lifespan(app: FastAPI): - print("Starting FastAPI app lifespan...") - # Start the tritonserver on FastAPI app startup - server, model_metadatas = init_tritonserver() - app.server = server - app.models = {metadata.name: metadata for metadata in model_metadatas} - - yield - - # Cleanup the tritonserver on FastAPI app shutdown - print("Shutting down FastAPI app lifespan...") - if app.server: - print("Shutting down Triton Inference Server...") - try: - app.server.stop() - # Log error, but don't raise on shutdown - except tritonserver.InternalError as e: - print(e) - - -def init_app(): - app = FastAPI( - title="OpenAI API", - description="The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details.", - version="2.0.0", - termsOfService="https://openai.com/policies/terms-of-use", - contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, - license={ - "name": "MIT", - "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", - }, - lifespan=lifespan, - ) - - app.include_router(observability.router) - app.include_router(models.router) - app.include_router(completions.router) - app.include_router(chat_completions.router) - - # NOTE: For debugging purposes, should generally be restricted or removed - add_cors_middleware(app) - - # TODO: Add common logger and use logger.debug in place of current print - # statements for debugging purposes. 
- - return app diff --git a/qa/L0_openai/openai/src/routers/__init__.py b/qa/L0_openai/openai/src/routers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/routers/chat_completions.py b/qa/L0_openai/openai/src/routers/chat_completions.py deleted file mode 100644 index a970574e71..0000000000 --- a/qa/L0_openai/openai/src/routers/chat_completions.py +++ /dev/null @@ -1,178 +0,0 @@ -import time -import uuid - -from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import StreamingResponse -from src.schemas.openai import ( - ChatCompletionChoice, - ChatCompletionFinishReason, - ChatCompletionResponseMessage, - ChatCompletionStreamingResponseChoice, - ChatCompletionStreamResponseDelta, - CreateChatCompletionRequest, - CreateChatCompletionResponse, - CreateChatCompletionStreamResponse, - ObjectType, -) -from src.utils.triton import get_output, validate_triton_responses - -router = APIRouter() - - -def get_first_response_role(conversation, add_generation_prompt, default_role): - if add_generation_prompt: - return default_role - - return conversation[-1]["role"] - - -def streaming_chat_completion_response(request_id, created, model, role, responses): - # first chunk - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=role, content="", function_call=None - ), - logprobs=None, - finish_reason=None, - ) - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - for response in responses: - text = get_output(response) - - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=None, content=text, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop if response.final else None, - ) - - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - yield "data: [DONE]\n\n" - - -@router.post( - "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] -) -def create_chat_completion( - request: CreateChatCompletionRequest, - raw_request: Request, -) -> CreateChatCompletionResponse | StreamingResponse: - """ - Creates a model response for the given chat conversation. 
- """ - - # TODO: Cleanup - print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") - print(f"[DEBUG] Fetching model metadata for {request.model=}") - - model_metadatas = raw_request.app.models - if not model_metadatas: - raise HTTPException(status_code=400, detail="No known models") - - metadata = model_metadatas.get(request.model) - if not metadata: - raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") - - if not metadata.request_convert_fn: - raise HTTPException( - status_code=400, detail=f"Unknown request format for model: {request.model}" - ) - - if not metadata.tokenizer: - raise HTTPException(status_code=400, detail="Unknown tokenizer") - - if not metadata.backend: - raise HTTPException(status_code=400, detail="Unknown backend") - - triton_model = raw_request.app.server.model(request.model) - if request.model != triton_model.name: - raise HTTPException( - status_code=400, - detail=f"Mismatched model name: {request.model} != {triton_model.name}", - ) - - if request.n and request.n > 1: - raise HTTPException(status_code=400, detail="Only single choice is supported") - - if request.logit_bias is not None or request.logprobs: - raise HTTPException( - status_code=400, detail="logit bias and log probs not supported" - ) - - conversation = [ - {"role": str(message.role), "content": str(message.content)} - for message in request.messages - ] - - # NOTE: This behavior should be tested further - # TODO: Do these need to be exposed to the user? - add_generation_prompt = True - default_role = "assistant" - role = get_first_response_role(conversation, add_generation_prompt, default_role) - - prompt = metadata.tokenizer.apply_chat_template( - conversation=conversation, - tokenize=False, - add_generation_prompt=add_generation_prompt, - ) - - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - - responses = triton_model.infer( - metadata.request_convert_fn(triton_model, prompt, request) - ) - - if request.stream: - return StreamingResponse( - streaming_chat_completion_response( - request_id, created, request.model, role, responses - ), - media_type="text/event-stream", - ) - - # Response validation with decoupled models in mind - responses = list(responses) - validate_triton_responses(responses) - response = responses[0] - text = get_output(response) - - return CreateChatCompletionResponse( - id=request_id, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionResponseMessage( - content=text, role=default_role, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop, - ) - ], - created=created, - model=request.model, - system_fingerprint=None, - object=ObjectType.chat_completion, - ) diff --git a/qa/L0_openai/openai/src/routers/completions.py b/qa/L0_openai/openai/src/routers/completions.py deleted file mode 100644 index 5d1e9b12fa..0000000000 --- a/qa/L0_openai/openai/src/routers/completions.py +++ /dev/null @@ -1,125 +0,0 @@ -import time -import uuid - -from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import StreamingResponse -from src.schemas.openai import ( - Choice, - CreateCompletionRequest, - CreateCompletionResponse, - FinishReason, - ObjectType, -) -from src.utils.triton import get_output, validate_triton_responses - -router = APIRouter() - - -def streaming_completion_response(request_id, created, model, responses): - for response in responses: - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop if response.final else 
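Both streaming code paths emit data-only server-sent events terminated by a `data: [DONE]` sentinel, so any plain HTTP client can consume them; a sketch using `requests`, assuming the local server and model name used elsewhere in these examples:

```python
import json
import requests

url = "http://localhost:8000/v1/chat/completions"
payload = {
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "What are LLMs?"}],
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # Each event is a single "data: <json>" line; blank lines separate events.
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        print(chunk["choices"][0]["delta"].get("content") or "", end="", flush=True)
print()
```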
None, - index=0, - logprobs=None, - text=text, - ) - response = CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=model, - ) - - yield f"data: {response.json(exclude_unset=True)}\n\n" - yield "data: [DONE]\n\n" - - -@router.post( - "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] -) -def create_completion( - request: CreateCompletionRequest, raw_request: Request -) -> CreateCompletionResponse | StreamingResponse: - """ - Creates a completion for the provided prompt and parameters. - """ - - if not request.model: - raise Exception("Request must provide a valid 'model'") - - print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") - print(f"[DEBUG] Fetching model metadata for {request.model=}") - metadata = raw_request.app.models.get(request.model) - - if not metadata: - raise HTTPException( - status_code=400, detail=f"Unknown model metadata for model: {request.model}" - ) - - if not metadata.request_convert_fn: - raise HTTPException( - status_code=400, detail=f"Unknown request format for model: {request.model}" - ) - - if request.suffix is not None: - raise HTTPException(status_code=400, detail="suffix is not currently supported") - - if request.model != metadata.name: - raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") - - if not request.prompt: - raise HTTPException(status_code=400, detail="prompt must be non-empty") - - # Currently only support single string as input - if not isinstance(request.prompt, str): - raise HTTPException( - status_code=400, detail="only single string input is supported" - ) - - if request.n and request.n > 1: - raise HTTPException(status_code=400, detail="Only single choice is supported") - - if request.logit_bias is not None or request.logprobs is not None: - raise HTTPException( - status_code=400, detail="logit bias and log probs not supported" - ) - - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - - triton_model = raw_request.app.server.model(request.model) - responses = triton_model.infer( - metadata.request_convert_fn(triton_model, request.prompt, request) - ) - if request.stream: - return StreamingResponse( - streaming_completion_response( - request_id, created, metadata.name, responses - ), - media_type="text/event-stream", - ) - - # Response validation with decoupled models in mind - responses = list(responses) - validate_triton_responses(responses) - response = responses[0] - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop, - index=0, - logprobs=None, - text=text, - ) - return CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=metadata.name, - ) diff --git a/qa/L0_openai/openai/src/routers/models.py b/qa/L0_openai/openai/src/routers/models.py deleted file mode 100644 index ff47000cfd..0000000000 --- a/qa/L0_openai/openai/src/routers/models.py +++ /dev/null @@ -1,59 +0,0 @@ -from fastapi import APIRouter, HTTPException, Request -from src.schemas.openai import ListModelsResponse, Model, ObjectType - -router = APIRouter() - -OWNED_BY = "Triton Inference Server" - - -@router.get("/v1/models", response_model=ListModelsResponse, tags=["Models"]) -def list_models(request: Request) -> ListModelsResponse: - """ - Lists the currently available models, and provides basic information about each one such as the owner and 
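Given the restrictions enforced by this handler (a single string prompt, `n=1`, no `suffix` or `logprobs`), a conforming request through the OpenAI client might look like the following sketch; the model name is assumed from the test setup used elsewhere in this directory:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Only a single string prompt and a single choice are accepted by this endpoint.
completion = client.completions.create(
    model="llama-3.1-8b-instruct",
    prompt="What is machine learning?",
    max_tokens=64,
)
print(completion.choices[0].text)
```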
availability. - """ - model_metadatas = request.app.models - if not model_metadatas: - raise HTTPException(status_code=400, detail="No known models") - - model_list = [] - for model in model_metadatas: - metadata = model_metadatas[model] - if not metadata: - raise HTTPException( - status_code=400, detail=f"No metadata for model: {model}" - ) - - model_list.append( - Model( - id=metadata.name, - created=metadata.create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - ) - - return ListModelsResponse(object=ObjectType.list, data=model_list) - - -@router.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) -def retrieve_model(request: Request, model_name: str) -> Model: - """ - Retrieves a model instance, providing basic information about the model such as the owner and permissioning. - """ - model_metadatas = request.app.models - if not model_metadatas: - raise HTTPException(status_code=400, detail="No known models") - - model = model_metadatas.get(model_name) - if not model: - raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") - - if model_name == model.name: - return Model( - id=model.name, - created=model.create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ) - - raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") diff --git a/qa/L0_openai/openai/src/routers/observability.py b/qa/L0_openai/openai/src/routers/observability.py deleted file mode 100644 index 98d506dab5..0000000000 --- a/qa/L0_openai/openai/src/routers/observability.py +++ /dev/null @@ -1,24 +0,0 @@ -from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import Response - -router = APIRouter() - - -@router.get("/metrics", tags=["Utilities"]) -def metrics(request: Request) -> str: - if not request.app.server or not request.app.server.live(): - raise HTTPException( - status_code=400, detail="Triton Inference Server is not live." - ) - - return request.app.server.metrics() - - -@router.get("/health", tags=["Utilities"]) -def health(request: Request) -> Response: - if not request.app.server or not request.app.server.live(): - raise HTTPException( - status_code=400, detail="Triton Inference Server is not live." - ) - - return Response(status_code=200) diff --git a/qa/L0_openai/openai/src/schemas/__init__.py b/qa/L0_openai/openai/src/schemas/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/schemas/openai.py b/qa/L0_openai/openai/src/schemas/openai.py deleted file mode 100644 index 488dfda3bb..0000000000 --- a/qa/L0_openai/openai/src/schemas/openai.py +++ /dev/null @@ -1,871 +0,0 @@ -# generated by fastapi-codegen: -# filename: api-spec/openai_trimmed.yml -# timestamp: 2024-05-05T21:52:36+00:00 - -from __future__ import annotations - -from enum import Enum -from typing import Any, Dict, List, Optional, Union - -from pydantic import AnyUrl, BaseModel, ConfigDict, Field, RootModel, confloat, conint - - -class Error(BaseModel): - code: str - message: str - param: str - type: str - - -class ErrorResponse(BaseModel): - error: Error - - -class Object(Enum): - list = "list" - - -class DeleteModelResponse(BaseModel): - id: str - deleted: bool - object: str - - -class Model1(Enum): - gpt_3_5_turbo_instruct = "gpt-3.5-turbo-instruct" - davinci_002 = "davinci-002" - babbage_002 = "babbage-002" - - -class PromptItem(RootModel): - root: List[Any] - - -class CreateCompletionRequest(BaseModel): - model: Union[str, Model1] = Field( - ..., - description="ID of the model to use. 
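Since these routes mirror the OpenAI models API, the standard client helpers should work unchanged; a short sketch, with the server address assumed as in the earlier examples:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# List every model the server currently reports, then fetch one by name.
models = client.models.list().data
for model in models:
    print(model.id, model.owned_by, model.created)

if models:
    print(client.models.retrieve(models[0].id))
```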
You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them.\n", - ) - prompt: Union[str, List[str], List[int], List[PromptItem]] = Field( - ..., - description="The prompt(s) to generate completions for, encoded as a string, array of strings, array of tokens, or array of token arrays.\n\nNote that <|endoftext|> is the document separator that the model sees during training, so if a prompt is not specified the model will generate as if from the beginning of a new document.\n", - ) - best_of: Optional[conint(ge=0, le=20)] = Field( - 1, - description='Generates `best_of` completions server-side and returns the "best" (the one with the highest log probability per token). Results cannot be streamed.\n\nWhen used with `n`, `best_of` controls the number of candidate completions and `n` specifies how many to return – `best_of` must be greater than `n`.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n', - ) - echo: Optional[bool] = Field( - False, description="Echo back the prompt in addition to the completion\n" - ) - frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( - 0, - description="Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", - ) - logit_bias: Optional[Dict[str, int]] = Field( - None, - description='Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the GPT tokenizer) to an associated bias value from -100 to 100. You can use this [tokenizer tool](/tokenizer?view=bpe) to convert text to token IDs. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n\nAs an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token from being generated.\n', - ) - logprobs: Optional[conint(ge=0, le=5)] = Field( - None, - description="Include the log probabilities on the `logprobs` most likely output tokens, as well the chosen tokens. For example, if `logprobs` is 5, the API will return a list of the 5 most likely tokens. The API will always return the `logprob` of the sampled token, so there may be up to `logprobs+1` elements in the response.\n\nThe maximum value for `logprobs` is 5.\n", - ) - max_tokens: Optional[conint(ge=0)] = Field( - 16, - description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", - examples=[16], - ) - n: Optional[conint(ge=1, le=128)] = Field( - 1, - description="How many completions to generate for each prompt.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. 
Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n", - examples=[1], - ) - presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( - 0, - description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", - ) - seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( - None, - description="If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\n\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", - ) - stop: Optional[Union[str, List[str]]] = Field( - None, - description="Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.\n", - ) - stream: Optional[bool] = Field( - False, - description="Whether to stream back partial progress. If set, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", - ) - suffix: Optional[str] = Field( - None, - description="The suffix that comes after a completion of inserted text.\n\nThis parameter is only supported for `gpt-3.5-turbo-instruct`.\n", - examples=["test."], - ) - temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( - 1, - description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", - examples=[1], - ) - top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( - 1, - description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", - examples=[1], - ) - user: Optional[str] = Field( - None, - description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", - examples=["user-1234"], - ) - - -class FinishReason(Enum): - stop = "stop" - length = "length" - content_filter = "content_filter" - - -class Logprobs(BaseModel): - text_offset: Optional[List[int]] = None - token_logprobs: Optional[List[float]] = None - tokens: Optional[List[str]] = None - top_logprobs: Optional[List[Dict[str, float]]] = None - - -class Choice(BaseModel): - finish_reason: FinishReason | None = Field( - ..., - description="The reason the model stopped generating tokens. 
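Because these are generated pydantic models with explicit bounds, malformed requests are rejected before they ever reach Triton; a small sketch of that behavior, assuming the module is importable as `src.schemas.openai` as in the routers above:

```python
from pydantic import ValidationError
from src.schemas.openai import CreateCompletionRequest

# Defaults are applied for omitted sampling parameters.
request = CreateCompletionRequest(model="llama-3.1-8b-instruct", prompt="Hello")
print(request.max_tokens, request.temperature, request.top_p)  # 16 1 1

# Out-of-range values fail validation (temperature is constrained to [0, 2]).
try:
    CreateCompletionRequest(model="llama-3.1-8b-instruct", prompt="Hello", temperature=5.0)
except ValidationError as err:
    print(err.errors()[0]["loc"])
```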
This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\nor `content_filter` if content was omitted due to a flag from our content filters.\n", - ) - index: int - logprobs: Logprobs | None - text: str - - -class Object1(Enum): - text_completion = "text_completion" - - -class Type(Enum): - image_url = "image_url" - - -class Detail(Enum): - auto = "auto" - low = "low" - high = "high" - - -class ImageUrl(BaseModel): - url: AnyUrl = Field( - ..., description="Either a URL of the image or the base64 encoded image data." - ) - detail: Optional[Detail] = Field( - "auto", - description="Specifies the detail level of the image. Learn more in the [Vision guide](/docs/guides/vision/low-or-high-fidelity-image-understanding).", - ) - - -class ChatCompletionRequestMessageContentPartImage(BaseModel): - type: Type = Field(..., description="The type of the content part.") - image_url: ImageUrl - - -class Type1(Enum): - text = "text" - - -class ChatCompletionRequestMessageContentPartText(BaseModel): - type: Type1 = Field(..., description="The type of the content part.") - text: str = Field(..., description="The text content.") - - -class Role(Enum): - system = "system" - - def __str__(self): - return self.name - - -class ChatCompletionRequestSystemMessage(BaseModel): - content: str = Field(..., description="The contents of the system message.") - role: Role = Field( - ..., description="The role of the messages author, in this case `system`." - ) - name: Optional[str] = Field( - None, - description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", - ) - - -class Role1(Enum): - user = "user" - - def __str__(self): - return self.name - - -class Role2(Enum): - assistant = "assistant" - - def __str__(self): - return self.name - - -class FunctionCall(BaseModel): - arguments: str = Field( - ..., - description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", - ) - name: str = Field(..., description="The name of the function to call.") - - -class Role3(Enum): - tool = "tool" - - def __str__(self): - return self.name - - -class ChatCompletionRequestToolMessage(BaseModel): - role: Role3 = Field( - ..., description="The role of the messages author, in this case `tool`." - ) - content: str = Field(..., description="The contents of the tool message.") - tool_call_id: str = Field( - ..., description="Tool call that this message is responding to." - ) - - -class Role4(Enum): - function = "function" - - def __str__(self): - return self.name - - -class ChatCompletionRequestFunctionMessage(BaseModel): - role: Role4 = Field( - ..., description="The role of the messages author, in this case `function`." 
- ) - content: str = Field(..., description="The contents of the function message.") - name: str = Field(..., description="The name of the function to call.") - - -class FunctionParameters(BaseModel): - model_config = ConfigDict(extra="allow") - # class Config: - # # TODO: Remove - # #extra = Extra.allow - # extra = "allow" - - -class ChatCompletionFunctions(BaseModel): - description: Optional[str] = Field( - None, - description="A description of what the function does, used by the model to choose when and how to call the function.", - ) - name: str = Field( - ..., - description="The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", - ) - parameters: Optional[FunctionParameters] = None - - -class ChatCompletionFunctionCallOption(BaseModel): - name: str = Field(..., description="The name of the function to call.") - - -class Type2(Enum): - function = "function" - - -class FunctionObject(BaseModel): - description: Optional[str] = Field( - None, - description="A description of what the function does, used by the model to choose when and how to call the function.", - ) - name: str = Field( - ..., - description="The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64.", - ) - parameters: Optional[FunctionParameters] = None - - -class ChatCompletionToolChoiceOption1(Enum): - none = "none" - auto = "auto" - required = "required" - - -class Function(BaseModel): - name: str = Field(..., description="The name of the function to call.") - - -class ChatCompletionNamedToolChoice(BaseModel): - type: Type2 = Field( - ..., - description="The type of the tool. Currently, only `function` is supported.", - ) - function: Function - - -class Function1(BaseModel): - name: str = Field(..., description="The name of the function to call.") - arguments: str = Field( - ..., - description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", - ) - - -class ChatCompletionMessageToolCall(BaseModel): - id: str = Field(..., description="The ID of the tool call.") - type: Type2 = Field( - ..., - description="The type of the tool. Currently, only `function` is supported.", - ) - function: Function1 = Field(..., description="The function that the model called.") - - -class Function2(BaseModel): - name: Optional[str] = Field(None, description="The name of the function to call.") - arguments: Optional[str] = Field( - None, - description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", - ) - - -class ChatCompletionMessageToolCallChunk(BaseModel): - index: int - id: Optional[str] = Field(None, description="The ID of the tool call.") - type: Optional[Type2] = Field( - None, - description="The type of the tool. 
Currently, only `function` is supported.", - ) - function: Optional[Function2] = None - - -class ChatCompletionRole(Enum): - system = "system" - user = "user" - assistant = "assistant" - tool = "tool" - function = "function" - - -class Role5(Enum): - assistant = "assistant" - - def __str__(self): - return self.name - - -class FunctionCall2(BaseModel): - arguments: Optional[str] = Field( - None, - description="The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling your function.", - ) - name: Optional[str] = Field(None, description="The name of the function to call.") - - -class Role6(Enum): - system = "system" - user = "user" - assistant = "assistant" - tool = "tool" - - def __str__(self): - return self.name - - -class ChatCompletionStreamResponseDelta(BaseModel): - content: Optional[str] = Field( - None, description="The contents of the chunk message." - ) - function_call: Optional[FunctionCall2] = Field( - None, - description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", - ) - tool_calls: Optional[List[ChatCompletionMessageToolCallChunk]] = None - role: Optional[str] = Field( - None, description="The role of the author of this message." - ) - - -class Model2(Enum): - gpt_4_turbo = "gpt-4-turbo" - gpt_4_turbo_2024_04_09 = "gpt-4-turbo-2024-04-09" - gpt_4_0125_preview = "gpt-4-0125-preview" - gpt_4_turbo_preview = "gpt-4-turbo-preview" - gpt_4_1106_preview = "gpt-4-1106-preview" - gpt_4_vision_preview = "gpt-4-vision-preview" - gpt_4 = "gpt-4" - gpt_4_0314 = "gpt-4-0314" - gpt_4_0613 = "gpt-4-0613" - gpt_4_32k = "gpt-4-32k" - gpt_4_32k_0314 = "gpt-4-32k-0314" - gpt_4_32k_0613 = "gpt-4-32k-0613" - gpt_3_5_turbo = "gpt-3.5-turbo" - gpt_3_5_turbo_16k = "gpt-3.5-turbo-16k" - gpt_3_5_turbo_0301 = "gpt-3.5-turbo-0301" - gpt_3_5_turbo_0613 = "gpt-3.5-turbo-0613" - gpt_3_5_turbo_1106 = "gpt-3.5-turbo-1106" - gpt_3_5_turbo_0125 = "gpt-3.5-turbo-0125" - gpt_3_5_turbo_16k_0613 = "gpt-3.5-turbo-16k-0613" - - -class Type6(Enum): - text = "text" - json_object = "json_object" - - -class ResponseFormat(BaseModel): - type: Optional[Type6] = Field( - "text", - description="Must be one of `text` or `json_object`.", - examples=["json_object"], - ) - - -class FunctionCall3(Enum): - none = "none" - auto = "auto" - - -class ChatCompletionFinishReason(Enum): - stop = "stop" - length = "length" - tool_calls = "tool_calls" - content_filter = "content_filter" - function_call = "function_call" - - -class Object2(Enum): - chat_completion = "chat.completion" - - -class FinishReason2(Enum): - stop = "stop" - length = "length" - function_call = "function_call" - content_filter = "content_filter" - - -class TopLogprob(BaseModel): - token: str = Field(..., description="The token.") - logprob: float = Field( - ..., - description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", - ) - bytes: List[int] = Field( - ..., - description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. 
Can be `null` if there is no bytes representation for the token.", - ) - - -class ChatCompletionTokenLogprob(BaseModel): - token: str = Field(..., description="The token.") - logprob: float = Field( - ..., - description="The log probability of this token, if it is within the top 20 most likely tokens. Otherwise, the value `-9999.0` is used to signify that the token is very unlikely.", - ) - bytes: List[int] = Field( - ..., - description="A list of integers representing the UTF-8 bytes representation of the token. Useful in instances where characters are represented by multiple tokens and their byte representations must be combined to generate the correct text representation. Can be `null` if there is no bytes representation for the token.", - ) - top_logprobs: List[TopLogprob] = Field( - ..., - description="List of the most likely tokens and their log probability, at this token position. In rare cases, there may be fewer than the number of requested `top_logprobs` returned.", - ) - - -class Logprobs2(BaseModel): - content: List[ChatCompletionTokenLogprob] = Field( - ..., - description="A list of message content tokens with log probability information.", - ) - - -class ChatCompletionFinishReason(Enum): - stop = "stop" - length = "length" - tool_calls = "tool_calls" - content_filter = "content_filter" - function_call = "function_call" - - -class ChatCompletionStreamingResponseChoice(BaseModel): - delta: ChatCompletionStreamResponseDelta - logprobs: Optional[Logprobs2] = Field( - None, description="Log probability information for the choice." - ) - finish_reason: ChatCompletionFinishReason | None = Field( - ..., - description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", - ) - index: int = Field( - ..., description="The index of the choice in the list of choices." - ) - - -class Object4(Enum): - chat_completion_chunk = "chat.completion.chunk" - - -class CreateChatCompletionStreamResponse(BaseModel): - id: str = Field( - ..., - description="A unique identifier for the chat completion. Each chunk has the same ID.", - ) - choices: List[ChatCompletionStreamingResponseChoice] = Field( - ..., - description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", - ) - created: int = Field( - ..., - description="The Unix timestamp (in seconds) of when the chat completion was created. Each chunk has the same timestamp.", - ) - model: str = Field(..., description="The model to generate the completion.") - system_fingerprint: Optional[str] = Field( - None, - description="This fingerprint represents the backend configuration that the model runs with.\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", - ) - object: Object4 = Field( - ..., description="The object type, which is always `chat.completion.chunk`." 
- ) - - -class CreateChatCompletionImageResponse(BaseModel): - pass - - -class Object5(Enum): - model = "model" - - -class Model(BaseModel): - id: str = Field( - ..., - description="The model identifier, which can be referenced in the API endpoints.", - ) - created: int = Field( - ..., description="The Unix timestamp (in seconds) when the model was created." - ) - object: Object5 = Field( - ..., description='The object type, which is always "model".' - ) - owned_by: str = Field(..., description="The organization that owns the model.") - - -class CompletionUsage(BaseModel): - completion_tokens: int = Field( - ..., description="Number of tokens in the generated completion." - ) - prompt_tokens: int = Field(..., description="Number of tokens in the prompt.") - total_tokens: int = Field( - ..., - description="Total number of tokens used in the request (prompt + completion).", - ) - - -class Event(Enum): - error = "error" - - -class ErrorEvent(BaseModel): - event: Event - data: Error - - -class Event1(Enum): - done = "done" - - -class Data(Enum): - field_DONE_ = "[DONE]" - - -class DoneEvent(BaseModel): - event: Event1 - data: Data - - -class ListModelsResponse(BaseModel): - object: Object - data: List[Model] - - -class CreateCompletionResponse(BaseModel): - id: str = Field(..., description="A unique identifier for the completion.") - choices: List[Choice] = Field( - ..., - description="The list of completion choices the model generated for the input prompt.", - ) - created: int = Field( - ..., - description="The Unix timestamp (in seconds) of when the completion was created.", - ) - model: str = Field(..., description="The model used for completion.") - system_fingerprint: Optional[str] = Field( - None, - description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", - ) - object: Object1 = Field( - ..., description='The object type, which is always "text_completion"' - ) - usage: Optional[CompletionUsage] = None - - -class ChatCompletionRequestMessageContentPart(RootModel): - root: Union[ - ChatCompletionRequestMessageContentPartText, - ChatCompletionRequestMessageContentPartImage, - ] - - -class ChatCompletionRequestUserMessage(BaseModel): - content: Union[str, List[ChatCompletionRequestMessageContentPart]] = Field( - ..., description="The contents of the user message.\n" - ) - role: Role1 = Field( - ..., description="The role of the messages author, in this case `user`." - ) - name: Optional[str] = Field( - None, - description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", - ) - - -class ChatCompletionTool(BaseModel): - type: Type2 = Field( - ..., - description="The type of the tool. 
Currently, only `function` is supported.", - ) - function: FunctionObject - - -class ChatCompletionToolChoiceOption(RootModel): - root: Union[ChatCompletionToolChoiceOption1, ChatCompletionNamedToolChoice] = Field( - ..., - description='Controls which (if any) tool is called by the model.\n`none` means the model will not call any tool and instead generates a message.\n`auto` means the model can pick between generating a message or calling one or more tools.\n`required` means the model must call one or more tools.\nSpecifying a particular tool via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that tool.\n\n`none` is the default when no tools are present. `auto` is the default if tools are present.\n', - ) - - -class ChatCompletionMessageToolCalls(RootModel): - root: List[ChatCompletionMessageToolCall] = Field( - ..., - description="The tool calls generated by the model, such as function calls.", - ) - - -class ChatCompletionResponseMessage(BaseModel): - content: str = Field(..., description="The contents of the message.") - tool_calls: Optional[ChatCompletionMessageToolCalls] = None - role: str = Field(..., description="The role of the author of this message.") - function_call: Optional[FunctionCall] = Field( - None, - description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", - ) - - -class ChatCompletionChoice(BaseModel): - finish_reason: ChatCompletionFinishReason = Field( - ..., - description="The reason the model stopped generating tokens. This will be `stop` if the model hit a natural stop point or a provided stop sequence,\n`length` if the maximum number of tokens specified in the request was reached,\n`content_filter` if content was omitted due to a flag from our content filters,\n`tool_calls` if the model called a tool, or `function_call` (deprecated) if the model called a function.\n", - ) - index: int = Field( - ..., description="The index of the choice in the list of choices." - ) - message: ChatCompletionResponseMessage - logprobs: Logprobs2 | None = Field( - ..., description="Log probability information for the choice." - ) - - -class CreateChatCompletionResponse(BaseModel): - id: str = Field(..., description="A unique identifier for the chat completion.") - choices: List[ChatCompletionChoice] = Field( - ..., - description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", - ) - created: int = Field( - ..., - description="The Unix timestamp (in seconds) of when the chat completion was created.", - ) - model: str = Field(..., description="The model used for the chat completion.") - system_fingerprint: Optional[str] = Field( - None, - description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", - ) - object: Object2 = Field( - ..., description="The object type, which is always `chat.completion`." - ) - usage: Optional[CompletionUsage] = None - - -class Choice2(BaseModel): - finish_reason: FinishReason2 = Field( - ..., - description="The reason the model stopped generating tokens. 
This will be `stop` if the model hit a natural stop point or a provided stop sequence, `length` if the maximum number of tokens specified in the request was reached, `content_filter` if content was omitted due to a flag from our content filters, or `function_call` if the model called a function.\n", - ) - index: int = Field( - ..., description="The index of the choice in the list of choices." - ) - message: ChatCompletionResponseMessage - - -class CreateChatCompletionFunctionResponse(BaseModel): - id: str = Field(..., description="A unique identifier for the chat completion.") - choices: List[Choice2] = Field( - ..., - description="A list of chat completion choices. Can be more than one if `n` is greater than 1.", - ) - created: int = Field( - ..., - description="The Unix timestamp (in seconds) of when the chat completion was created.", - ) - model: str = Field(..., description="The model used for the chat completion.") - system_fingerprint: Optional[str] = Field( - None, - description="This fingerprint represents the backend configuration that the model runs with.\n\nCan be used in conjunction with the `seed` request parameter to understand when backend changes have been made that might impact determinism.\n", - ) - object: Object2 = Field( - ..., description="The object type, which is always `chat.completion`." - ) - usage: Optional[CompletionUsage] = None - - -class ChatCompletionRequestAssistantMessage(BaseModel): - content: Optional[str] = Field( - None, - description="The contents of the assistant message. Required unless `tool_calls` or `function_call` is specified.\n", - ) - role: Role2 = Field( - ..., description="The role of the messages author, in this case `assistant`." - ) - name: Optional[str] = Field( - None, - description="An optional name for the participant. Provides the model information to differentiate between participants of the same role.", - ) - tool_calls: Optional[ChatCompletionMessageToolCalls] = None - function_call: Optional[FunctionCall] = Field( - None, - description="Deprecated and replaced by `tool_calls`. The name and arguments of a function that should be called, as generated by the model.", - ) - - -class ChatCompletionRequestMessage(RootModel): - root: Union[ - ChatCompletionRequestSystemMessage, - ChatCompletionRequestUserMessage, - ChatCompletionRequestAssistantMessage, - ChatCompletionRequestToolMessage, - ChatCompletionRequestFunctionMessage, - ] - - @property - def role(self): - return self.root.role - - @property - def content(self): - return self.root.content - - -class CreateChatCompletionRequest(BaseModel): - messages: List[ChatCompletionRequestMessage] = Field( - ..., - description="A list of messages comprising the conversation so far. [Example Python code](https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models).", - min_length=1, - ) - model: Union[str, Model2] = Field( - ..., - description="ID of the model to use. See the [model endpoint compatibility](/docs/models/model-endpoint-compatibility) table for details on which models work with the Chat API.", - examples=["gpt-4-turbo"], - ) - frequency_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( - 0, - description="Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", - ) - logit_bias: Optional[Dict[str, int]] = Field( - None, - description="Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n", - ) - logprobs: Optional[bool] = Field( - False, - description="Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the `content` of `message`.", - ) - top_logprobs: Optional[conint(ge=0, le=20)] = Field( - None, - description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.", - ) - max_tokens: Optional[conint(ge=0)] = Field( - 16, - description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", - ) - n: Optional[conint(ge=1, le=128)] = Field( - 1, - description="How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.", - examples=[1], - ) - presence_penalty: Optional[confloat(ge=-2.0, le=2.0)] = Field( - 0, - description="Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", - ) - response_format: Optional[ResponseFormat] = Field( - None, - description='An object specifying the format that the model must output. Compatible with [GPT-4 Turbo](/docs/models/gpt-4-and-gpt-4-turbo) and all GPT-3.5 Turbo models newer than `gpt-3.5-turbo-1106`.\n\nSetting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.\n\n**Important:** when using JSON mode, you **must** also instruct the model to produce JSON yourself via a system or user message. Without this, the model may generate an unending stream of whitespace until the generation reaches the token limit, resulting in a long-running and seemingly "stuck" request. 
Also note that the message content may be partially cut off if `finish_reason="length"`, which indicates the generation exceeded `max_tokens` or the conversation exceeded the max context length.\n', - ) - seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field( - None, - description="This feature is in Beta.\nIf specified, our system will make a best effort to sample deterministically, such that repeated requests with the same `seed` and parameters should return the same result.\nDeterminism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend.\n", - ) - stop: Optional[Union[str, List[str]]] = Field( - None, - description="Up to 4 sequences where the API will stop generating further tokens.\n", - ) - stream: Optional[bool] = Field( - False, - description="If set, partial message deltas will be sent, like in ChatGPT. Tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format) as they become available, with the stream terminated by a `data: [DONE]` message. [Example Python code](https://cookbook.openai.com/examples/how_to_stream_completions).\n", - ) - temperature: Optional[confloat(ge=0.0, le=2.0)] = Field( - 0.7, - description="What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.\n", - examples=[1], - ) - top_p: Optional[confloat(ge=0.0, le=1.0)] = Field( - 1, - description="An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.\n\nWe generally recommend altering this or `temperature` but not both.\n", - examples=[1], - ) - tools: Optional[List[ChatCompletionTool]] = Field( - None, - description="A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. A max of 128 functions are supported.\n", - ) - tool_choice: Optional[ChatCompletionToolChoiceOption] = None - user: Optional[str] = Field( - None, - description="A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices/end-user-ids).\n", - examples=["user-1234"], - ) - function_call: Optional[ - Union[FunctionCall3, ChatCompletionFunctionCallOption] - ] = Field( - None, - description='Deprecated in favor of `tool_choice`.\n\nControls which (if any) function is called by the model.\n`none` means the model will not call a function and instead generates a message.\n`auto` means the model can pick between generating a message or calling a function.\nSpecifying a particular function via `{"name": "my_function"}` forces the model to call that function.\n\n`none` is the default when no functions are present. 
`auto` is the default if functions are present.\n', - ) - functions: Optional[List[ChatCompletionFunctions]] = Field( - None, - description="Deprecated in favor of `tools`.\n\nA list of functions the model may generate JSON inputs for.\n", - max_length=128, - min_length=1, - ) - - -# Additional Aliases for Convenience - - -class ObjectType: - model = Object5.model - list = Object.list - text_completion = Object1.text_completion - chat_completion_chunk = Object4.chat_completion_chunk - chat_completion = Object2.chat_completion diff --git a/qa/L0_openai/openai/src/tests/__init__.py b/qa/L0_openai/openai/src/tests/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/tests/conftest.py b/qa/L0_openai/openai/src/tests/conftest.py deleted file mode 100644 index 2ff9697d59..0000000000 --- a/qa/L0_openai/openai/src/tests/conftest.py +++ /dev/null @@ -1,75 +0,0 @@ -from pathlib import Path - -import pytest -from fastapi.testclient import TestClient -from src.tests.utils import OpenAIServer, setup_fastapi_app - -### TEST ENVIRONMENT SETUP ### -TEST_BACKEND = "" -TEST_MODEL = "" -TEST_PROMPT = "What is machine learning?" -TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] -TEST_TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct" -try: - import vllm as _ - - TEST_BACKEND = "vllm" - TEST_MODEL = "llama-3.1-8b-instruct" -except ImportError: - pass - -try: - import tensorrt_llm as _ - - TEST_BACKEND = "tensorrtllm" - TEST_MODEL = "tensorrt_llm_bls" -except ImportError: - pass - -if not TEST_BACKEND or not TEST_MODEL: - raise Exception("Unknown test environment") -### - - -# NOTE: OpenAI client requires actual server running, and won't work -# with the FastAPI TestClient. Run the server at module scope to run -# only once for all the tests below. -@pytest.fixture(scope="module") -def server(): - model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" - args = ["--model-repository", model_repository, "--tokenizer", TEST_TOKENIZER] - - with OpenAIServer(args) as openai_server: - yield openai_server - - -# NOTE: The FastAPI TestClient acts like a server and triggers the FastAPI app -# lifespan startup/shutdown, but does not actually expose the network port to interact -# with arbitrary clients - you must use the TestClient returned to interact with -# the "server" when "starting the server" via TestClient. -@pytest.fixture(scope="class") -def fastapi_client_class_scope(): - model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, model_repository=model_repository) - with TestClient(app) as test_client: - yield test_client - - -@pytest.fixture -def model(): - return TEST_MODEL - - -@pytest.fixture -def backend(): - return TEST_BACKEND - - -@pytest.fixture -def prompt(): - return TEST_PROMPT - - -@pytest.fixture -def messages(): - return TEST_MESSAGES diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/1/.tmp b/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt deleted file mode 100644 index b82990446d..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/ensemble/config.pbtxt +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
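A test built on these fixtures might look like the following sketch; the test body is hypothetical, but the fixture names come from the conftest above and the request/response shapes follow the chat completions handler shown earlier:

```python
# Hypothetical example test; fastapi_client_class_scope is class-scoped, so the
# FastAPI app (and the embedded Triton server) is started once per test class.
class TestChatCompletions:
    def test_basic_chat(self, fastapi_client_class_scope, model, messages):
        response = fastapi_client_class_scope.post(
            "/v1/chat/completions",
            json={"model": model, "messages": messages},
        )
        assert response.status_code == 200
        message = response.json()["choices"][0]["message"]
        assert message["role"] == "assistant"
        assert message["content"]
```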
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "ensemble" -platform: "ensemble" -max_batch_size: 64 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: 
"embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "DECODER_QUERY" - value: "decoder_text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - input_map { - key: "EMBEDDING_BIAS_WORDS" - value: "embedding_bias_words" - } - input_map { - key: "EMBEDDING_BIAS_WEIGHTS" - value: "embedding_bias_weights" - } - input_map { - key: "END_ID" - value: "end_id" - } - input_map { - key: "PAD_ID" - value: "pad_id" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_DECODER_INPUT_LEN" - value: "_REQUEST_DECODER_INPUT_LEN" - } - output_map { - key: "DECODER_INPUT_ID" - value: "_DECODER_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - output_map { - key: "STOP_WORDS_IDS" - value: "_STOP_WORDS_IDS" - } - output_map { - key: "BAD_WORDS_IDS" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "EMBEDDING_BIAS" - value: "_EMBEDDING_BIAS" - } - output_map { - key: "OUT_END_ID" - value: "_PREPROCESSOR_END_ID" - } - output_map { - key: "OUT_PAD_ID" - value: "_PREPROCESSOR_PAD_ID" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "decoder_input_ids" - value: "_DECODER_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "decoder_input_lengths" - value: "_REQUEST_DECODER_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "_PREPROCESSOR_END_ID" - } - input_map { - key: "pad_id" - value: "_PREPROCESSOR_PAD_ID" - } - input_map { - key: "embedding_bias" - value: "_EMBEDDING_BIAS" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_length" - value: "min_length" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "random_seed" - value: "random_seed" - } - input_map { - key: "return_log_probs" - value: "return_log_probs" - } - input_map { - key: "return_context_logits" - value: "return_context_logits" - } - input_map { - key: "return_generation_logits" - value: "return_generation_logits" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - 
input_map { - key: "prompt_embedding_table" - value: "prompt_embedding_table" - } - input_map { - key: "prompt_vocab_size" - value: "prompt_vocab_size" - } - input_map { - key: "stop_words_list" - value: "_STOP_WORDS_IDS" - } - input_map { - key: "bad_words_list" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - output_map { - key: "sequence_length" - value: "_SEQUENCE_LENGTH" - }, - output_map { - key: "cum_log_probs" - value: "_CUM_LOG_PROBS" - } - output_map { - key: "output_log_probs" - value: "_OUTPUT_LOG_PROBS" - }, - output_map { - key: "context_logits" - value: "_CONTEXT_LOGITS" - }, - output_map { - key: "generation_logits" - value: "_GENERATION_LOGITS" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - input_map { - key: "CUM_LOG_PROBS" - value: "_CUM_LOG_PROBS" - } - input_map { - key: "OUTPUT_LOG_PROBS" - value: "_OUTPUT_LOG_PROBS" - } - input_map { - key: "CONTEXT_LOGITS" - value: "_CONTEXT_LOGITS" - } - input_map { - key: "GENERATION_LOGITS" - value: "_GENERATION_LOGITS" - } - input_map { - key: "SEQUENCE_LENGTH" - value: "_SEQUENCE_LENGTH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - output_map { - key: "OUT_OUTPUT_LOG_PROBS" - value: "output_log_probs" - } - output_map { - key: "OUT_CUM_LOG_PROBS" - value: "cum_log_probs" - } - output_map { - key: "OUT_CONTEXT_LOGITS" - value: "context_logits" - } - output_map { - key: "OUT_GENERATION_LOGITS" - value: "generation_logits" - } - } - ] -} diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py deleted file mode 100644 index 0812e19b3e..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/1/model.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
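For reference, the ensemble config above exposes the client-facing tensors (text_input, max_tokens, text_output, and the optional sampling controls) and internally rewires them through preprocessing, tensorrt_llm, and postprocessing. A minimal client sketch against such an ensemble could look like the following; the server address and prompt are hypothetical, and it assumes the tritonclient HTTP package is installed.

import numpy as np
import tritonclient.http as httpclient

# Hypothetical local Triton endpoint serving the "ensemble" model above.
client = httpclient.InferenceServerClient(url="localhost:8000")

# Client-visible shapes are [batch, -1]; max_batch_size is 64 in the config.
text = np.array([["What is machine learning?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", [1, 1], "BYTES"),
    httpclient.InferInput("max_tokens", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

# The ensemble routes the request through preprocessing -> tensorrt_llm -> postprocessing.
result = client.infer(model_name="ensemble", inputs=inputs)
print(result.as_numpy("text_output"))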
- -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] - - skip_special_tokens = model_config["parameters"].get("skip_special_tokens") - if skip_special_tokens is not None: - skip_special_tokens_str = skip_special_tokens["string_value"].lower() - if skip_special_tokens_str in [ - "true", - "false", - "1", - "0", - "t", - "f", - "y", - "n", - "yes", - "no", - ]: - self.skip_special_tokens = skip_special_tokens_str in [ - "true", - "1", - "t", - "y", - "yes", - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." - ) - self.skip_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." - ) - self.skip_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True - ) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. 
- for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, "TOKENS_BATCH" - ).as_numpy() - - # Get sequence length - sequence_lengths = pb_utils.get_input_tensor_by_name( - request, "SEQUENCE_LENGTH" - ).as_numpy() - - # Get cum log probs - cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS") - - # Get sequence length - output_log_probs = pb_utils.get_input_tensor_by_name( - request, "OUTPUT_LOG_PROBS" - ) - - # Get context logits - context_logits = pb_utils.get_input_tensor_by_name( - request, "CONTEXT_LOGITS" - ) - - # Get generation logits - generation_logits = pb_utils.get_input_tensor_by_name( - request, "GENERATION_LOGITS" - ) - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch, sequence_lengths) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - "OUTPUT", np.array(outputs).astype(self.output_dtype) - ) - - outputs = [] - outputs.append(output_tensor) - - if cum_log_probs: - out_cum_log_probs = pb_utils.Tensor( - "OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy() - ) - outputs.append(out_cum_log_probs) - else: - out_cum_log_probs = pb_utils.Tensor( - "OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32) - ) - outputs.append(out_cum_log_probs) - - if output_log_probs: - out_output_log_probs = pb_utils.Tensor( - "OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy() - ) - outputs.append(out_output_log_probs) - else: - out_output_log_probs = pb_utils.Tensor( - "OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32) - ) - outputs.append(out_output_log_probs) - - if context_logits: - out_context_logits = pb_utils.Tensor( - "OUT_CONTEXT_LOGITS", context_logits.as_numpy() - ) - outputs.append(out_context_logits) - else: - out_context_logits = pb_utils.Tensor( - "OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32) - ) - outputs.append(out_context_logits) - - if generation_logits: - out_generation_logits = pb_utils.Tensor( - "OUT_GENERATION_LOGITS", generation_logits.as_numpy() - ) - outputs.append(out_generation_logits) - else: - out_generation_logits = pb_utils.Tensor( - "OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32) - ) - outputs.append(out_generation_logits) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=outputs) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. 
- """ - print("Cleaning up...") - - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], skip_special_tokens=self.skip_special_tokens - ) - outputs.append(output.encode("utf8")) - return outputs diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt deleted file mode 100644 index dee851662d..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/postprocessing/config.pbtxt +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "postprocessing" -backend: "python" -max_batch_size: 256 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "SEQUENCE_LENGTH" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "CUM_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "OUTPUT_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - }, - { - name: "CONTEXT_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - }, - { - name: "GENERATION_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - optional: true - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "OUT_CUM_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "OUT_OUTPUT_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "OUT_CONTEXT_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "OUT_GENERATION_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" - } -} - -parameters { - key: "skip_special_tokens" - value: { - string_value: "${skip_special_tokens}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py deleted file mode 100644 index eb4487c803..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/1/model.py +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -from typing import List - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. 
- """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] - - add_special_tokens = model_config["parameters"].get("add_special_tokens") - if add_special_tokens is not None: - add_special_tokens_str = add_special_tokens["string_value"].lower() - if add_special_tokens_str in [ - "true", - "false", - "1", - "0", - "t", - "f", - "y", - "n", - "yes", - "no", - ]: - self.add_special_tokens = add_special_tokens_str in [ - "true", - "1", - "t", - "y", - "yes", - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." - ) - self.add_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." - ) - self.add_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True - ) - if isinstance(self.tokenizer, T5Tokenizer): - self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.tokenizer_end_id = self.tokenizer.encode( - self.tokenizer.eos_token, add_special_tokens=False - )[0] - self.tokenizer_pad_id = self.tokenizer.encode( - self.tokenizer.pad_token, add_special_tokens=False - )[0] - - # Parse model output configs and convert Triton types to numpy types - output_names = [ - "INPUT_ID", - "DECODER_INPUT_ID", - "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "OUT_END_ID", - "OUT_PAD_ID", - ] - input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_input_config_by_name(model_config, input_name)[ - "data_type" - ] - ), - ) - - for output_name in output_names: - setattr( - self, - output_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name(model_config, output_name)[ - "data_type" - ] - ), - ) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. 
- Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - logger = pb_utils.Logger - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() - decoder_query = pb_utils.get_input_tensor_by_name(request, "DECODER_QUERY") - if decoder_query is not None: - decoder_query = decoder_query.as_numpy() - - batch_dim = query.shape[0] - if batch_dim != 1: - err_str = ( - "Inflight batching backend expects requests with batch size of 1." - ) - logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError(err_str) - ) - ) - continue - - request_output_len = pb_utils.get_input_tensor_by_name( - request, "REQUEST_OUTPUT_LEN" - ).as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, "BAD_WORDS_DICT" - ) - if bad_words_dict is not None: - bad_words_dict = bad_words_dict.as_numpy() - - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, "STOP_WORDS_DICT" - ) - if stop_words_dict is not None: - stop_words_dict = stop_words_dict.as_numpy() - - embedding_bias_words = pb_utils.get_input_tensor_by_name( - request, "EMBEDDING_BIAS_WORDS" - ) - if embedding_bias_words is not None: - embedding_bias_words = embedding_bias_words.as_numpy() - - embedding_bias_weights = pb_utils.get_input_tensor_by_name( - request, "EMBEDDING_BIAS_WEIGHTS" - ) - if embedding_bias_weights is not None: - embedding_bias_weights = embedding_bias_weights.as_numpy() - - # Take the end_id from the input tensors - # If not specified, use tokenizer to get end_id - end_id = pb_utils.get_input_tensor_by_name(request, "END_ID") - if end_id is not None: - end_id = end_id.as_numpy() - else: - end_id = [[self.tokenizer_end_id]] - - # Take the pad_id from the input tensors - # If not specified, use tokenizer to get pad_id - pad_id = pb_utils.get_input_tensor_by_name(request, "PAD_ID") - if pad_id is not None: - pad_id = pad_id.as_numpy() - else: - pad_id = [[self.tokenizer_pad_id]] - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - if decoder_query is not None: - decoder_input_id, request_decoder_input_len = self._create_request( - decoder_query - ) - else: - decoder_input_id = pad_id * np.ones((1, 1), np.int32) - request_decoder_input_len = 1 * np.ones((1, 1), np.int32) - - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - embedding_bias = self._get_embedding_bias( - embedding_bias_words, - embedding_bias_weights, - self.embedding_bias_weights_dtype, - ) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
- input_id_tensor = pb_utils.Tensor( - "INPUT_ID", input_id.astype(self.input_id_dtype) - ) - request_input_len_tensor = pb_utils.Tensor( - "REQUEST_INPUT_LEN", - request_input_len.astype(self.request_input_len_dtype), - ) - decoder_input_id_tensor = pb_utils.Tensor( - "DECODER_INPUT_ID", decoder_input_id.astype(self.decoder_input_id_dtype) - ) - request_decoder_input_len_tensor = pb_utils.Tensor( - "REQUEST_DECODER_INPUT_LEN", - request_decoder_input_len.astype(self.request_decoder_input_len_dtype), - ) - request_output_len_tensor = pb_utils.Tensor( - "REQUEST_OUTPUT_LEN", request_output_len - ) - bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) - stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) - embedding_bias_tensor = pb_utils.Tensor("EMBEDDING_BIAS", embedding_bias) - end_id_tensor = pb_utils.Tensor( - "OUT_END_ID", np.array(end_id, dtype=np.int32) - ) - pad_id_tensor = pb_utils.Tensor( - "OUT_PAD_ID", np.array(pad_id, dtype=np.int32) - ) - - inference_response = pb_utils.InferenceResponse( - output_tensors=[ - input_id_tensor, - decoder_input_id_tensor, - bad_words_ids_tensor, - stop_words_ids_tensor, - request_input_len_tensor, - request_decoder_input_len_tensor, - request_output_len_tensor, - embedding_bias_tensor, - end_id_tensor, - pad_id_tensor, - ] - ) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - if isinstance(self.tokenizer, T5Tokenizer): - start_ids = [ - np.array( - [self.tokenizer_bos_id] - + self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens - ) - ).astype(int) - for s in query - ] - else: - start_ids = [ - np.array( - self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens - ) - ).astype(int) - for s in query - ] - start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) - - max_len = 0 - for seq in start_ids: - max_len = max(max_len, seq.shape[0]) - start_ids = np.stack( - [ - np.pad( - seq, - (0, max_len - seq.shape[0]), - "constant", - constant_values=(0, self.tokenizer_pad_id), - ) - for seq in start_ids - ] - ) - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): - """ - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
- """ - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) - - def _get_embedding_bias( - self, embedding_bias_words, embedding_bias_weights, bias_dtype - ): - assert self.tokenizer != None, "need to set tokenizer" - - if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) - - batch_embedding_bias = [] - for words, weights in zip(embedding_bias_words, embedding_bias_weights): - vocab_size = self.tokenizer.vocab_size - embedding_bias = [0.0] * vocab_size - - assert len(words) == len( - weights - ), "Embedding bias words must have same dimension as embedding bias weights" - - for word, weight in zip(words, weights): - if isinstance(word, bytes): - word = word.decode() - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - for id in ids: - embedding_bias[id] += weight - - batch_embedding_bias.append(np.array(embedding_bias)) - - return np.array(batch_embedding_bias, dtype=bias_dtype) diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt deleted file mode 100644 index a262cf6983..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/preprocessing/config.pbtxt +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "preprocessing" -backend: "python" -max_batch_size: 256 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "DECODER_QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "EMBEDDING_BIAS_WORDS" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "EMBEDDING_BIAS_WEIGHTS" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "END_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - }, - { - name: "PAD_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "DECODER_INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_DECODER_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "EMBEDDING_BIAS" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "OUT_END_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "OUT_PAD_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" - } -} - -parameters { - key: "add_special_tokens" - value: { - string_value: "${add_special_tokens}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py deleted file mode 100644 index 3425a20f57..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/1/model.py +++ /dev/null @@ -1,797 +0,0 @@ -import datetime -import json -import os -import time -from threading import Lock, Thread - -import numpy as np -import tensorrt_llm.bindings.executor as trtllm -import triton_python_backend_utils as pb_utils -from torch import from_numpy - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is None: - if name == "temperature": - print(f"Tensor for {name} is None!") - return None - return tensor.as_numpy() - - -def get_input_scalar_by_name(request, name): - tensor = get_input_tensor_by_name(request, name) - if tensor is 
None: - if name == "temperature": - print(f"Scalar for {name} is None!") - return None - if tensor.size != 1: - raise pb_utils.TritonModelException(f"Expected a single value for {name}") - return tensor.item() - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." - ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config["parameters"]: - return None - return read_parameter_as_type( - model_config["parameters"][name]["string_value"], name, pytype - ) - - -def convert_word_list(word_list): - if word_list is None: - return None - word_list = word_list.tolist() - if len(word_list) == 0 or len(word_list[0]) != 2: - raise pb_utils.TritonModelException(f"Invalid format for word list.") - words, indices = word_list[0] - result = [] - current_index = 0 - for i in indices: - if i == -1: - continue - if i > len(words): - raise pb_utils.TritonModelException(f"Invalid format for word list.") - current_word = [] - while current_index < i: - current_word.append(words[current_index]) - current_index += 1 - result.append(current_word) - return result - - -def parse_medusa_choices(medusa_choices): - if medusa_choices is None: - return None - try: - result = json.loads( - "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]" - ) - assert isinstance(result, list) and len(result) > 0 - assert all([isinstance(x, list) for x in result]) - assert all([isinstance(y, int) for x in result for y in x]) - except Exception: - raise pb_utils.TritonModelException("Invalid format for medusa_choices") - return result - - -def get_sampling_config_from_request(request): - kwargs = {} - kwargs["beam_width"] = get_input_scalar_by_name(request, "beam_width") or 1 - kwargs["top_k"] = get_input_scalar_by_name(request, "runtime_top_k") - kwargs["top_p"] = get_input_scalar_by_name(request, "runtime_top_p") - kwargs["top_p"] = ( - None if kwargs["top_p"] is None or kwargs["top_p"] <= 0 else kwargs["top_p"] - ) - kwargs["random_seed"] = get_input_scalar_by_name(request, "random_seed") - kwargs["temperature"] = get_input_scalar_by_name(request, "temperature") - # print(f"=========== [DEBUG] [trtllm python runtime model.py] {kwargs['temperature']=} ==========") - kwargs["min_length"] = get_input_scalar_by_name(request, "min_length") - kwargs["repetition_penalty"] = get_input_scalar_by_name( - request, "repetition_penalty" - ) - kwargs["presence_penalty"] = get_input_scalar_by_name(request, "presence_penalty") - kwargs["frequency_penalty"] = get_input_scalar_by_name(request, "frequency_penalty") - kwargs["length_penalty"] = get_input_scalar_by_name(request, "len_penalty") - kwargs["top_p_min"] = get_input_scalar_by_name(request, "runtime_top_p_min") - kwargs["top_p_reset_ids"] = get_input_scalar_by_name( - request, "runtime_top_p_reset_ids" - ) - kwargs["top_p_decay"] = get_input_scalar_by_name(request, "runtime_top_p_decay") - kwargs["beam_search_diversity_rate"] = get_input_scalar_by_name( - request, "beam_search_diversity_rate" - ) - kwargs["early_stopping"] = get_input_scalar_by_name(request, "early_stopping") - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.SamplingConfig(**kwargs) - - -def 
get_output_config_from_request(request, exclude_input_from_output): - kwargs = {} - kwargs["return_log_probs"] = get_input_scalar_by_name(request, "return_log_probs") - kwargs["return_context_logits"] = get_input_scalar_by_name( - request, "return_context_logits" - ) - kwargs["return_generation_logits"] = get_input_scalar_by_name( - request, "return_generation_logits" - ) - kwargs["exclude_input_from_output"] = exclude_input_from_output - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.OutputConfig(**kwargs) - - -def get_external_draft_tokens_config_from_request(request): - kwargs = {} - draft_input_ids = get_input_tensor_by_name(request, "draft_input_ids") - if draft_input_ids is not None: - kwargs["tokens"] = draft_input_ids.tolist() - draft_logits = get_input_tensor_by_name(request, "draft_logits") - if draft_logits is not None: - kwargs["logits"] = from_numpy(draft_logits) - kwargs["acceptance_threshold"] = get_input_scalar_by_name( - request, "draft_acceptance_threshold" - ) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.ExternalDraftTokensConfig(**kwargs) - return None - - -def get_prompt_tuning_config_from_request(request): - # prompt_vocab_size is unused by executor. - kwargs = {} - prompt_embedding_table = get_input_tensor_by_name(request, "prompt_embedding_table") - if prompt_embedding_table is not None: - kwargs["embedding_table"] = from_numpy(prompt_embedding_table) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.PromptTuningConfig(**kwargs) - return None - - -def get_lora_config_from_request(request): - kwargs = {} - kwargs["task_id"] = get_input_scalar_by_name(request, "lora_task_id") - lora_weights = get_input_tensor_by_name(request, "lora_weights") - if lora_weights is not None: - kwargs["weights"] = from_numpy(lora_weights) - lora_config = get_input_tensor_by_name(request, "lora_config") - if lora_config is not None: - kwargs["config"] = from_numpy(lora_config) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.LoraConfig(**kwargs) - return None - - -def convert_request(request, exclude_input_from_output, decoupled): - inputs = {} - input_token_ids = get_input_tensor_by_name(request, "input_ids") - if input_token_ids is None: - raise pb_utils.TritonModelException("A value is required for input_ids") - input_token_ids = input_token_ids.tolist() - if len(input_token_ids) == 0: - raise pb_utils.TritonModelException(f"Invalid format for input_ids") - inputs["input_token_ids"] = input_token_ids[0] - # input_lengths is not not used by executor. - inputs["max_new_tokens"] = get_input_scalar_by_name(request, "request_output_len") - if inputs["max_new_tokens"] is None: - raise pb_utils.TritonModelException( - "A value is required for request_output_len" - ) - inputs["streaming"] = get_input_scalar_by_name(request, "streaming") - if inputs["streaming"] and not decoupled: - raise pb_utils.TritonModelException( - "Streaming is only supported in decoupled mode." 
- ) - inputs["end_id"] = get_input_scalar_by_name(request, "end_id") - inputs["pad_id"] = get_input_scalar_by_name(request, "pad_id") - inputs["stop_words"] = convert_word_list( - get_input_tensor_by_name(request, "stop_words_list") - ) - inputs["bad_words"] = convert_word_list( - get_input_tensor_by_name(request, "bad_words_list") - ) - embedding_bias = get_input_tensor_by_name(request, "embedding_bias") - if embedding_bias is not None and embedding_bias.size != 0: - inputs["embedding_bias"] = from_numpy(embedding_bias).squeeze() - - sampling_config = get_sampling_config_from_request(request) - output_config = get_output_config_from_request(request, exclude_input_from_output) - external_draft_tokens_config = get_external_draft_tokens_config_from_request( - request - ) - prompt_tuning_config = get_prompt_tuning_config_from_request(request) - lora_config = get_lora_config_from_request(request) - - return trtllm.Request( - **inputs, - sampling_config=sampling_config, - output_config=output_config, - external_draft_tokens_config=external_draft_tokens_config, - prompt_tuning_config=prompt_tuning_config, - lora_config=lora_config, - ) - - -def convert_response(response): - if response.has_error(): - return ( - pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError(response.error_msg) - ), - True, - ) - result = response.result - beam_lengths = np.expand_dims( - np.array([len(beam) for beam in result.output_token_ids], np.int32), 0 - ) - max_beam_length = max([len(beam) for beam in result.output_token_ids]) - output_ids = np.full( - (1, len(result.output_token_ids), max_beam_length), -1, np.int32 - ) - for idx, beam in enumerate(result.output_token_ids): - output_ids[0, idx, : len(beam)] = beam - output_tensors = [ - pb_utils.Tensor("output_ids", output_ids), - pb_utils.Tensor("sequence_length", beam_lengths), - ] - output_tensors.append( - pb_utils.Tensor( - "cum_log_probs", - np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) - if result.cum_log_probs is not None - else np.zeros((1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "output_log_probs", - np.expand_dims(np.array(result.log_probs, np.float32), 0) - if result.log_probs is not None - else np.zeros((1, 1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "context_logits", - np.expand_dims(np.array(result.context_logits, np.float32), 0) - if result.context_logits is not None - else np.zeros((1, 1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "generation_logits", - np.expand_dims(np.array(result.generation_logits, np.float32), 0) - if result.generation_logits is not None - else np.zeros((1, 1, 1, 1), np.float32), - ) - ) - return pb_utils.InferenceResponse(output_tensors), result.is_final - - -def convert_scheduler_policy(batch_scheduler_policy: str): - if batch_scheduler_policy.lower() == "max_utilization": - return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION - elif batch_scheduler_policy.lower() == "guaranteed_no_evict": - return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT - raise pb_utils.TritonModelException( - f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." 
- ) - - -def convert_batching_type(gpt_model_type: str): - if gpt_model_type is None: - return None - if ( - gpt_model_type.lower() == "inflight_fused_batching" - or gpt_model_type.lower() == "inflight_batching" - ): - return trtllm.BatchingType.INFLIGHT - elif gpt_model_type.lower() == "v1": - return trtllm.BatchingType.STATIC - raise pb_utils.TritonModelException( - f"gpt_model_type value of '{gpt_model_type}' is not supported." - ) - - -def convert_decoding_mode(decoding_mode: str): - if decoding_mode is None: - return None - elif decoding_mode == "auto": - return trtllm.DecodingMode.Auto() - elif decoding_mode == "top_k": - return trtllm.DecodingMode.TopK() - elif decoding_mode == "top_p": - return trtllm.DecodingMode.TopP() - elif decoding_mode == "top_k_top_p": - return trtllm.DecodingMode.TopKTopP() - elif decoding_mode == "beam_search": - return trtllm.DecodingMode.BeamSearch() - elif decoding_mode == "medusa": - return trtllm.DecodingMode.Medusa() - raise pb_utils.TritonModelException( - f"decoding_mode value of '{decoding_mode}' is not supported." - ) - - -def convert_timestamp_to_seconds(timestamp: str): - return int(datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp()) - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def get_scheduler_config(self, model_config): - batch_scheduler_policy = get_parameter(model_config, "batch_scheduler_policy") - if batch_scheduler_policy is None: - return trtllm.SchedulerConfig() - return trtllm.SchedulerConfig(convert_scheduler_policy(batch_scheduler_policy)) - - def get_kv_cache_config(self, model_config): - kwargs = { - "enable_block_reuse": get_parameter( - model_config, "enable_kv_cache_reuse", bool - ), - "max_tokens": get_parameter( - model_config, "max_tokens_in_paged_kv_cache", int - ), - "sink_token_length": get_parameter(model_config, "sink_token_length", int), - "max_attention_window": get_parameter( - model_config, "max_attention_window_size", int - ), - "free_gpu_memory_fraction": get_parameter( - model_config, "kv_cache_free_gpu_mem_fraction", float - ), - "host_cache_size": get_parameter( - model_config, "kv_cache_host_memory_bytes", int - ), - "onboard_blocks": get_parameter( - model_config, "kv_cache_onboard_blocks", bool - ), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.KvCacheConfig(**kwargs) - - def get_parallel_config(self, model_config): - kwargs = {} - gpu_device_ids = get_parameter(model_config, "gpu_device_ids") - if gpu_device_ids: - kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] - self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", "0") == "1" - if self.use_orchestrator_mode: - kwargs["communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR - worker_path = get_parameter(model_config, "worker_path") - if worker_path is not None: - raise pb_utils.TritonModelException( - "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable." 
- ) - executor_worker_path = get_parameter(model_config, "executor_worker_path") - kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( - True, executor_worker_path - ) - if len(kwargs) > 0: - return trtllm.ParallelConfig(**kwargs) - return None - - def get_peft_cache_config(self, model_config): - kwargs = { - "optimal_adapter_size": get_parameter( - model_config, "lora_cache_optimal_adapter_size", int - ), - "max_adapter_size": get_parameter( - model_config, "lora_cache_max_adapter_size", int - ), - "device_cache_percent": get_parameter( - model_config, "lora_cache_gpu_memory_fraction", float - ), - "host_cache_size": get_parameter( - model_config, "lora_cache_host_memory_bytes", int - ), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.PeftCacheConfig(**kwargs) - - def get_decoding_config(self, model_config): - kwargs = { - "medusa_choices": parse_medusa_choices( - get_parameter(model_config, "medusa_choices") - ), - "decoding_mode": convert_decoding_mode( - get_parameter(model_config, "decoding_mode") - ), - } - print(kwargs) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.DecodingConfig(**kwargs) - - def get_executor_config(self, model_config): - kwargs = { - "max_beam_width": get_parameter(model_config, "max_beam_width", int), - "scheduler_config": self.get_scheduler_config(model_config), - "kv_cache_config": self.get_kv_cache_config(model_config), - "enable_chunked_context": get_parameter( - model_config, "enable_chunked_context", bool - ), - "normalize_log_probs": get_parameter( - model_config, "normalize_log_probs", bool - ), - "batching_type": convert_batching_type( - get_parameter(model_config, "gpt_model_type") - ), - "parallel_config": self.get_parallel_config(model_config), - "peft_cache_config": self.get_peft_cache_config(model_config), - "decoding_config": self.get_decoding_config(model_config), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.ExecutorConfig(**kwargs) - - def create_metrics(self, model: str, version: str, is_v1_model: bool): - self.request_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_request_metrics", - description="TRT LLM request metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.runtime_memory_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_runtime_memory_metrics", - description="TRT LLM runtime memory metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.kv_cache_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_kv_cache_block_metrics", - description="TRT LLM KV cache block metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - model_type = "v1" if is_v1_model else "inflight_batcher" - self.model_type_metric_family = pb_utils.MetricFamily( - name=f"nv_trt_llm_{model_type}_metrics", - description=f"TRT LLM {model_type}-specific metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.general_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_general_metrics", - description="General TRT LLM metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - common_labels = {"model": model, "version": version} - self.all_metrics = { - # Request metrics - "num_active_requests": self.request_metric_family.Metric( - labels={"request_type": "active", **common_labels} - ), - "max_num_active_requests": self.request_metric_family.Metric( - labels={"request_type": "max", **common_labels} - ), - "num_scheduled_requests": self.request_metric_family.Metric( - labels={"request_type": "scheduled", **common_labels} - ), - 
"num_context_requests": self.request_metric_family.Metric( - labels={"request_type": "context", **common_labels} - ), - # Runtime metrics - "cpu_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "cpu", **common_labels} - ), - "gpu_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "gpu", **common_labels} - ), - "pinned_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "pinned", **common_labels} - ), - # KV cache metrics - "max_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "max", **common_labels} - ), - "free_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "free", **common_labels} - ), - "used_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "used", **common_labels} - ), - "tokens_per_block": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "tokens_per", **common_labels} - ), - # General metrics - "timestamp": self.general_metric_family.Metric( - labels={"general_type": "timestamp", **common_labels} - ), - "iter": self.general_metric_family.Metric( - labels={"general_type": "iteration_counter", **common_labels} - ), - } - if is_v1_model: - self.all_metrics.update( - { - "num_ctx_tokens": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "total_context_tokens", - **common_labels, - } - ), - "num_gen_tokens": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "total_generation_tokens", - **common_labels, - } - ), - "empty_gen_slots": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "empty_generation_slots", - **common_labels, - } - ), - } - ) - else: - self.all_metrics.update( - { - "num_ctx_tokens": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "total_context_tokens", - **common_labels, - } - ), - "num_gen_requests": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "generation_requests", - **common_labels, - } - ), - "micro_batch_id": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "micro_batch_id", - **common_labels, - } - ), - "num_paused_requests": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "paused_requests", - **common_labels, - } - ), - } - ) - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args["model_config"]) - gpt_model_path = get_parameter(model_config, "gpt_model_path") - if get_parameter(model_config, "enable_trt_overlap", bool): - raise pb_utils.TritonModelException( - f"enable_trt_overlap=true is not supported." 
- ) - self.exclude_input_from_output = get_parameter( - model_config, "exclude_input_in_output", bool - ) - executor_config = self.get_executor_config(model_config) - self.executor = trtllm.Executor( - gpt_model_path, trtllm.ModelType.DECODER_ONLY, executor_config - ) - self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) - self.cancellation_check_period_ms = ( - get_parameter(model_config, "cancellation_check_period_ms", int) or 100 - ) - self.stats_check_period_ms = ( - get_parameter(model_config, "stats_check_period_ms", int) or 100 - ) - - if not self.decoupled: - raise pb_utils.TritonModelException( - "Please enable decoupled transaction policy in the model configuration to serve this model" - ) - - self.create_metrics( - args["model_name"], - args["model_version"], - is_v1_model=executor_config.batching_type == trtllm.BatchingType.STATIC, - ) - self.triton_id_to_req_id = {} - self.req_id_to_response_sender = {} - self.lock = Lock() - self.running = False - self.awaiter_thread = Thread(target=self.awaiter_loop) - self.cancellation_thread = Thread(target=self.cancellation_loop) - self.metrics_thread = Thread(target=self.metrics_loop) - if self.executor.can_enqueue_requests(): - self.running = True - self.awaiter_thread.start() - self.cancellation_thread.start() - self.metrics_thread.start() - else: - # In leader mode, worker ranks will wait here until leader is done. - self.executor.shutdown() - - def handle_stop_request(self, triton_id, response_sender): - if triton_id is None or triton_id == "": - response_sender.send( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - "A request id must be provided for request cancellation" - ) - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - return - - if triton_id in self.triton_id_to_req_id: - req_id = self.triton_id_to_req_id[triton_id] - self.executor.cancel_request(req_id) - - response_sender.send( - pb_utils.InferenceResponse(), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - if not self.executor.can_enqueue_requests(): - return - - # Convert to executor requests. 
- triton_requests = [] - executor_requests = [] - for request in requests: - response_sender = request.get_response_sender() - if get_input_scalar_by_name(request, "stop"): - self.handle_stop_request(request.request_id(), response_sender) - else: - try: - converted = convert_request( - request, self.exclude_input_from_output, self.decoupled - ) - except Exception as e: - response_sender.send( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" - ) - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - else: - triton_requests.append(request) - executor_requests.append(converted) - - with self.lock: - request_ids = self.executor.enqueue_requests(executor_requests) - for req_id, request in zip(request_ids, triton_requests): - triton_id = request.request_id() - self.req_id_to_response_sender[req_id] = ( - triton_id, - request.get_response_sender(), - ) - self.triton_id_to_req_id[triton_id] = req_id - return None - - def awaiter_loop(self): - """Gets responses from executor and returns the results.""" - while self.running: - for response in self.executor.await_responses( - timeout=datetime.timedelta(milliseconds=1) - ): - req_id = response.request_id - with self.lock: - if req_id not in self.req_id_to_response_sender: - continue - triton_id, response_sender = self.req_id_to_response_sender[req_id] - - triton_response, is_final = convert_response(response) - response_sender.send( - triton_response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - if is_final - else 0, - ) - - if is_final: - with self.lock: - del self.triton_id_to_req_id[triton_id] - del self.req_id_to_response_sender[req_id] - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def cancellation_loop(self): - """Checks if any pending requests have been cancelled.""" - while self.running: - time.sleep(self.cancellation_check_period_ms / 1000.0) - with self.lock: - for req_id, ( - triton_id, - response_sender, - ) in self.req_id_to_response_sender.items(): - if response_sender.is_cancelled(): - self.executor.cancel_request(req_id) - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def metrics_loop(self): - """Updates triton metrics using stats from the executor.""" - while self.running: - time.sleep(self.stats_check_period_ms / 1000.0) - for stat in self.executor.get_latest_iteration_stats(): - try: - for key, metric in self.all_metrics.items(): - value = None - if hasattr(stat, key): - value = getattr(stat, key) - elif stat.kv_cache_stats is not None and hasattr( - stat.kv_cache_stats, key - ): - value = getattr(stat.kv_cache_stats, key) - elif stat.static_batching_stats is not None and hasattr( - stat.static_batching_stats, key - ): - value = getattr(stat.static_batching_stats, key) - elif stat.inflight_batching_stats is not None and hasattr( - stat.inflight_batching_stats, key - ): - value = getattr(stat.inflight_batching_stats, key) - if value is not None: - if key == "timestamp": - value = convert_timestamp_to_seconds(value) - metric.set(value) - else: - pb_utils.Logger.log_warn(f'Metric "{key}" not found.') - except Exception as e: - pb_utils.Logger.log_warn(f"Error while processing metrics: {e}") - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. 
This function allows - the model to perform any necessary clean ups before exit. - """ - if self.executor.can_enqueue_requests(): - self.running = False - self.awaiter_thread.join() - self.cancellation_thread.join() - self.metrics_thread.join() - self.executor.shutdown() diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt deleted file mode 100644 index 7c9f294b89..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
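As a reference for the thread lifecycle in the deleted model above (awaiter/cancellation/metrics loops guarded by a shared running flag and a lock, joined in finalize), here is a minimal, self-contained sketch of the same start/stop discipline in plain Python. The class and method names are illustrative only and are not part of the TensorRT-LLM backend API.

import threading
import time


class BackgroundLoops:
    """Illustrative only: the start/stop pattern used by the model above,
    reduced to plain Python (shared flag + lock, threads joined on shutdown)."""

    def __init__(self, poll_period_s: float = 0.1):
        self.poll_period_s = poll_period_s
        self.lock = threading.Lock()
        self.running = False
        self.pending = {}  # stands in for req_id_to_response_sender
        self._threads = [
            threading.Thread(target=self._awaiter_loop),
            threading.Thread(target=self._cancellation_loop),
        ]

    def start(self):
        self.running = True
        for t in self._threads:
            t.start()

    def _awaiter_loop(self):
        while self.running:
            time.sleep(self.poll_period_s)
            with self.lock:
                # Iterate over a snapshot so entries can be removed safely.
                for req_id in list(self.pending):
                    _ = self.pending[req_id]

    def _cancellation_loop(self):
        while self.running:
            time.sleep(self.poll_period_s)
            # A real implementation would check response_sender.is_cancelled() here.

    def stop(self):
        # Mirrors finalize(): clear the flag first so every loop exits on its
        # next iteration, then join the threads before shutting anything down.
        self.running = False
        for t in self._threads:
            t.join()


if __name__ == "__main__":
    loops = BackgroundLoops()
    loops.start()
    time.sleep(0.3)
    loops.stop()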
- -name: "tensorrt_llm" -backend: "tensorrtllm" -#backend: "python" -max_batch_size: 256 - -model_transaction_policy { - decoupled: True -} - -dynamic_batching { - preferred_batch_size: [ 256 ] - max_queue_delay_microseconds: 1000 -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - allow_ragged_batch: true - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "draft_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - reshape: { shape: [ ] } - }, - { - name: "draft_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "draft_acceptance_threshold" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "embedding_bias" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_reset_ids" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "early_stopping" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: 
"return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters: { - key: "max_beam_width" - value: { - string_value: "${max_beam_width}" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -parameters: { - key: "gpt_model_path" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct" - } -} -parameters: { - key: "encoder_model_path" - value: { - string_value: "${encoder_engine_dir}" - } -} -parameters: { - key: "max_tokens_in_paged_kv_cache" - 
value: { - string_value: "${max_tokens_in_paged_kv_cache}" - } -} -parameters: { - key: "max_attention_window_size" - value: { - string_value: "${max_attention_window_size}" - } -} -parameters: { - key: "sink_token_length" - value: { - string_value: "${sink_token_length}" - } -} -parameters: { - key: "batch_scheduler_policy" - value: { - string_value: "${batch_scheduler_policy}" - } -} -parameters: { - key: "kv_cache_free_gpu_mem_fraction" - value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" - } -} -parameters: { - key: "kv_cache_host_memory_bytes" - value: { - string_value: "${kv_cache_host_memory_bytes}" - } -} -parameters: { - key: "kv_cache_onboard_blocks" - value: { - string_value: "${kv_cache_onboard_blocks}" - } -} -# enable_trt_overlap is deprecated and doesn't have any effect on the runtime -# parameters: { -# key: "enable_trt_overlap" -# value: { -# string_value: "${enable_trt_overlap}" -# } -# } -parameters: { - key: "exclude_input_in_output" - value: { - string_value: "True" - } -} -parameters: { - key: "cancellation_check_period_ms" - value: { - string_value: "${cancellation_check_period_ms}" - } -} -parameters: { - key: "stats_check_period_ms" - value: { - string_value: "${stats_check_period_ms}" - } -} -parameters: { - key: "iter_stats_max_iterations" - value: { - string_value: "${iter_stats_max_iterations}" - } -} -parameters: { - key: "request_stats_max_iterations" - value: { - string_value: "${request_stats_max_iterations}" - } -} -parameters: { - key: "enable_kv_cache_reuse" - value: { - string_value: "${enable_kv_cache_reuse}" - } -} -parameters: { - key: "normalize_log_probs" - value: { - string_value: "${normalize_log_probs}" - } -} -parameters: { - key: "enable_chunked_context" - value: { - string_value: "${enable_chunked_context}" - } -} -parameters: { - key: "gpu_device_ids" - value: { - string_value: "${gpu_device_ids}" - } -} -parameters: { - key: "lora_cache_optimal_adapter_size" - value: { - string_value: "${lora_cache_optimal_adapter_size}" - } -} -parameters: { - key: "lora_cache_max_adapter_size" - value: { - string_value: "${lora_cache_max_adapter_size}" - } -} -parameters: { - key: "lora_cache_gpu_memory_fraction" - value: { - string_value: "${lora_cache_gpu_memory_fraction}" - } -} -parameters: { - key: "lora_cache_host_memory_bytes" - value: { - string_value: "${lora_cache_host_memory_bytes}" - } -} -parameters: { - key: "decoding_mode" - value: { - string_value: "${decoding_mode}" - } -} -parameters: { - key: "executor_worker_path" - value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" - } -} -parameters: { - key: "medusa_choices" - value: { - string_value: "${medusa_choices}" - } -} -parameters: { - key: "gpu_weights_percent" - value: { - string_value: "${gpu_weights_percent}" - } -} diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py deleted file mode 100644 index c621cc14b4..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - decoder_text_input: np.ndarray = None - max_tokens: np.ndarray = np.array([]) - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) - _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not stream, "streaming is not supported with speculative decoding" 
- ) - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding", - ) - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - decoder_input_ids: np.ndarray = None - input_lengths: np.ndarray = np.array([]) - decoder_input_lengths: np.ndarray = None - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - - @classmethod - def with_new_inputs( - cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None, - ): - return cls( - input_ids=(input_ids if input_ids is not None else other.input_ids), - input_lengths=( - input_lengths if input_lengths is not None else other.input_lengths - ), - decoder_input_ids=other.decoder_input_ids, - decoder_input_lengths=other.decoder_input_lengths, - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return ( - np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) - and np.array_equal(self.generation_logits, o.generation_logits) - ) - - -class Decoder: - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode( - self, request: Request, speculative_decoding=False - ) -> Generator[Response, None, None]: - preproc_response = self.preprocess(request) - - # print(f"[DEBUG] Decoder.decode {request.temperature=}") - if speculative_decoding: - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming: - gen_response = self._generate_non_streaming(preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids) :], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, request: Request - ) -> Generator[GenerationResponse, None, None]: - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - 
draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1, - ) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens - ) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None and request.use_draft_logits[0]: - if draft_response.generation_logits is not None: - draft_logits = draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids) : seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0) - ) - if request.use_draft_logits is not None and request.use_draft_logits[0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens) :], 0 - ) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request - ) - last_input_ids = input_ids - input_ids = target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, - np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32), - ) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = len(input_ids) >= len(prompt_input_ids) + output_len - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. 
- # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids - ) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0] - ) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, num_draft_tokens: int - ) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = ( - new_tokens - if (self._accumulated_tokens is None) - else np.concatenate((self._accumulated_tokens, new_tokens), axis=2) - ) - sequence_lengths = np.array( - [[self._accumulated_tokens.shape[2]]], dtype=np.int32 - ) - return self._postprocess( - self._accumulated_tokens, sequence_lengths, gen_response - ) - else: - return self._postprocess(gen_response.output_ids, None, gen_response) - - def _postprocess( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index 62c06f4836..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - def __init__( - self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None, - ): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "DECODER_INPUT_ID", - "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - ] - - self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "decoder_text_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": 
"generation_logits", - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError(f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response( - self, triton_response, response_factory: Callable, name_map=None - ): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'" - ) - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs, - ) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "decoder_text_input": "DECODER_QUERY", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "DECODER_INPUT_ID": "decoder_input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": "stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - } - return self.convert_triton_response(triton_output, PreprocResponse, name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, num_draft_tokens: int - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors( - preproc, request, num_draft_tokens, None, True - ) - triton_req = pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - triton_response = 
self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors( - self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False, - ): - tensors = [] - # print(f"[get_llm_tensors] {request.temperature=}") - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request( - request, num_output_tokens, draft_request, is_draft_model_request - ) - ) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "decoder_input_ids": "decoder_input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False, - ): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - # "temperature": "temperature", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - "presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - # print(f"[get_llm_tensors_from_request] {request.temperature=}") - temp_found = "temperature" in name_map - # print(f"[get_llm_tensors_from_request] temperature in name_map = {temp_found}") - tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor( - "request_output_len", np.array([[out_len]], dtype=np.int32) - ) - ) - - if draft_request: - if 
draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", draft_request.draft_input_ids) - ) - if ( - draft_request.draft_logits is not None - and request.use_draft_logits is not None - and request.use_draft_logits[0] - ): - tensors.append( - pb_utils.Tensor("draft_logits", draft_request.draft_logits) - ) - - return_context_logits = False - return_generation_logits = False - if draft_request is None: - if is_draft_model_request: - return_generation_logits = ( - request.use_draft_logits[0] - if request.use_draft_logits is not None - else False - ) - else: - return_context_logits = ( - request.return_context_logits[0] - if request.return_context_logits is not None - else False - ) - return_generation_logits = ( - request.return_generation_logits[0] - if request.return_generation_logits is not None - else False - ) - - tensors.append( - pb_utils.Tensor( - "return_context_logits", np.array([[return_context_logits]]) - ) - ) - tensors.append( - pb_utils.Tensor( - "return_generation_logits", np.array([[return_generation_logits]]) - ) - ) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - } - return self.convert_triton_response(triton_output, GenerationResponse, name_map) - - def _postprocess( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ) -> Response: - input_tensors = self._get_postproc_tensors( - tokens, sequence_lengths, gen_response - ) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs, - ) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", - sequence_lengths if sequence_lengths else gen_response.sequence_length, - ), - ] - return tensors - - def _get_response(self, triton_output, gen_res: GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response( - text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits, - ) - return response diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 0a5d54546d..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - def initialize(self, args): - # Parse model configs - model_config = json.loads(args["model_config"]) - - params = model_config["parameters"] - - accumulate_tokens_str = "" - if "accumulate_tokens" in params: - accumulate_tokens_str = params["accumulate_tokens"]["string_value"] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - "true", - "yes", - "1", - "t", - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"]["string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params["tensorrt_llm_draft_model_name"][ - "string_value" - ] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name, - ) - - def execute(self, requests): - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - req = self.decoder.convert_triton_request(request) - req.validate() - # print(f"[DEBUG] ========= [bls model.py] {req.temperature=} ===========") - speculative_decode = ( - req.num_draft_tokens is not None and req.num_draft_tokens[0][0] > 0 - ) - if speculative_decode and ( - self.draft_llm_model_name is None or self.draft_llm_model_name == "" - ): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode - ) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - 
error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(traceback.format_exc()), - ) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") diff --git a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt b/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index aa3b26336c..0000000000 --- a/qa/L0_openai/openai/src/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
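To make the BLS request flow above concrete, the following is a hypothetical client-side sketch that drives a deployment of this model through Triton's HTTP generate extension. The endpoint path comes from that extension; the host, port, prompt, and parameter values are illustrative assumptions, and the tensor names match the BLS config that follows.

import requests

# Hypothetical usage sketch, not part of this patch. Assumes a Triton server with
# the tensorrt_llm_bls model loaded and the default HTTP port (8000) exposed.
TRITON_URL = "http://localhost:8000"
MODEL = "tensorrt_llm_bls"

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 64,
    "stream": False,  # the model is decoupled; use /generate_stream with True for SSE
    "temperature": 0.0,
}

response = requests.post(f"{TRITON_URL}/v2/models/{MODEL}/generate", json=payload)
response.raise_for_status()
print(response.json()["text_output"])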
- -backend: "python" -max_batch_size: 256 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "num_draft_tokens", - data_type: TYPE_INT32, - dims: [ 1 ] - optional: true - }, - { - name: "use_draft_logits", - data_type: TYPE_BOOL, - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters: { - key: "accumulate_tokens" - value: { - string_value: "${accumulate_tokens}" - } -} -parameters: { - key: "tensorrt_llm_model_name" - value: { - string_value: "tensorrt_llm" - } -} -parameters: { - key: "tensorrt_llm_draft_model_name" - value: { - string_value: "" - } -} - -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] diff --git a/qa/L0_openai/openai/src/tests/test_chat_completions.py b/qa/L0_openai/openai/src/tests/test_chat_completions.py deleted file mode 100644 index d02a127412..0000000000 --- 
a/qa/L0_openai/openai/src/tests/test_chat_completions.py +++ /dev/null @@ -1,447 +0,0 @@ -import copy -from pathlib import Path -from typing import List - -import pytest -from fastapi.testclient import TestClient -from src.tests.utils import setup_fastapi_app - - -class TestChatCompletions: - @pytest.fixture(scope="class") - def client(self, fastapi_client_class_scope): - yield fastapi_client_class_scope - - def test_chat_completions_defaults(self, client, model: str, messages: List[dict]): - response = client.post( - "/v1/chat/completions", - json={"model": model, "messages": messages}, - ) - - assert response.status_code == 200 - message = response.json()["choices"][0]["message"] - assert message["content"].strip() - assert message["role"] == "assistant" - # "usage" currently not supported - assert response.json()["usage"] == None - - def test_chat_completions_system_prompt(self, client, model: str): - # NOTE: Currently just sanity check that there are no issues when a - # system role is provided. There is no test logic to measure the quality - # of the response yet. - messages = [ - {"role": "system", "content": "You are a Triton Inference Server expert."}, - {"role": "user", "content": "What is machine learning?"}, - ] - - response = client.post( - "/v1/chat/completions", json={"model": model, "messages": messages} - ) - - assert response.status_code == 200 - message = response.json()["choices"][0]["message"] - assert message["content"].strip() - assert message["role"] == "assistant" - - def test_chat_completions_system_prompt_only(self, client, model: str): - # No user prompt provided - messages = [ - {"role": "system", "content": "You are a Triton Inference Server expert."} - ] - - response = client.post( - "/v1/chat/completions", json={"model": model, "messages": messages} - ) - - assert response.status_code == 200 - message = response.json()["choices"][0]["message"] - assert message["content"].strip() - assert message["role"] == "assistant" - - @pytest.mark.parametrize( - "param_key, param_value", - [ - ("temperature", 0.7), - ("max_tokens", 10), - ("top_p", 0.9), - ("frequency_penalty", 0.5), - ("presence_penalty", 0.2), - # logprobs is a boolean for chat completions - ("logprobs", True), - ("logit_bias", {"0": 0}), - ], - ) - def test_chat_completions_sampling_parameters( - self, client, param_key, param_value, model: str, messages: List[dict] - ): - response = client.post( - "/v1/chat/completions", - json={ - "model": model, - "messages": messages, - param_key: param_value, - }, - ) - - # TODO: Add support and remove this check - unsupported_parameters = ["logprobs", "logit_bias"] - if param_key in unsupported_parameters: - assert response.status_code == 400 - assert response.json()["detail"] == "logit bias and log probs not supported" - return - - assert response.status_code == 200 - assert response.json()["choices"][0]["message"]["content"] - assert response.json()["choices"][0]["message"]["role"] == "assistant" - - @pytest.mark.parametrize( - "param_key, param_value", - [ - ("temperature", 2.1), - ("temperature", -0.1), - ("max_tokens", -1), - ("top_p", 1.1), - ("frequency_penalty", 3), - ("frequency_penalty", -3), - ("presence_penalty", 2.1), - ("presence_penalty", -2.1), - ], - ) - def test_chat_completions_invalid_sampling_parameters( - self, client, param_key, param_value, model: str, messages: List[dict] - ): - response = client.post( - "/v1/chat/completions", - json={ - "model": model, - "messages": messages, - param_key: param_value, - }, - ) - - print("Response:", 
response.json()) - assert response.status_code == 422 - - # Simple tests to verify max_tokens roughly behaves as expected - def test_chat_completions_max_tokens( - self, client, model: str, messages: List[dict] - ): - responses = [] - payload = {"model": model, "messages": messages, "max_tokens": 1} - - # Send two requests with max_tokens = 1 to check their similarity - payload["max_tokens"] = 1 - responses.append( - client.post( - "/v1/chat/completions", - json=payload, - ) - ) - responses.append( - client.post( - "/v1/chat/completions", - json=payload, - ) - ) - # Send one requests with larger max_tokens to check its dis-similarity - payload["max_tokens"] = 100 - responses.append( - client.post( - "/v1/chat/completions", - json=payload, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = ( - responses[0].json()["choices"][0]["message"]["content"].strip().split() - ) - response2_text = ( - responses[1].json()["choices"][0]["message"]["content"].strip().split() - ) - response3_text = ( - responses[2].json()["choices"][0]["message"]["content"].strip().split() - ) - # Simplification: One token shouldn't be more than one space-delimited word - assert len(response1_text) == len(response2_text) == 1 - assert len(response3_text) > len(response1_text) - - @pytest.mark.parametrize( - "temperature", - [0.0, 1.0], - ) - # Simple tests to verify temperature roughly behaves as expected - def test_chat_completions_temperature_vllm( - self, client, temperature, backend: str, model: str, messages: List[dict] - ): - if backend != "vllm": - pytest.skip(reason="Only used to test vLLM-specific temperature behavior") - - responses = [] - payload = { - "model": model, - "messages": messages, - "max_tokens": 256, - "temperature": temperature, - } - - responses.append( - client.post( - "/v1/chat/completions", - json=payload, - ) - ) - responses.append( - client.post( - "/v1/chat/completions", - json=payload, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = ( - responses[0].json()["choices"][0]["message"]["content"].strip().split() - ) - response2_text = ( - responses[1].json()["choices"][0]["message"]["content"].strip().split() - ) - - # Temperature of 0.0 indicates greedy sampling, so check - # that two equivalent requests produce the same response. - if temperature == 0.0: - # NOTE: This check may be ambitious to get an exact match in all - # cases depending on how other parameter defaults are set, so - # it can probably be removed if it introduces flakiness. - assert response1_text == response2_text - # Temperature of 1.0 indicates maximum randomness, so check - # that two equivalent requests produce different responses. 
- elif temperature == 1.0: - assert response1_text != response2_text - # Don't bother checking values other than the extremes - else: - raise ValueError(f"Unexpected {temperature=} for this test.") - - # Remove xfail when fix is released and this test returns xpass status - @pytest.mark.xfail( - reason="TRT-LLM BLS model will ignore temperature until a later release" - ) - # Simple tests to verify temperature roughly behaves as expected - def test_chat_completions_temperature_tensorrtllm( - self, client, backend: str, model: str, messages: List[dict] - ): - if backend != "tensorrtllm": - pytest.skip( - reason="Only used to test TRT-LLM-specific temperature behavior" - ) - - responses = [] - payload1 = { - "model": model, - "messages": messages, - # Increase token length to allow more room for variability - "max_tokens": 200, - "temperature": 0.0, - # TRT-LLM requires certain settings of `top_k` / `top_p` to - # respect changes in `temperature` - "top_p": 0.5, - } - - payload2 = copy.deepcopy(payload1) - payload2["temperature"] = 1.0 - - # First 2 responses should be the same in TRT-LLM with identical payload - responses.append( - client.post( - "/v1/chat/completions", - json=payload1, - ) - ) - responses.append( - client.post( - "/v1/chat/completions", - json=payload1, - ) - ) - # Third response should differ with different temperature in payload - responses.append( - client.post( - "/v1/chat/completions", - json=payload2, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = ( - responses[0].json()["choices"][0]["message"]["content"].strip().split() - ) - response2_text = ( - responses[1].json()["choices"][0]["message"]["content"].strip().split() - ) - response3_text = ( - responses[2].json()["choices"][0]["message"]["content"].strip().split() - ) - - assert response1_text == response2_text - assert response1_text != response3_text - - # Simple tests to verify random seed roughly behaves as expected - def test_chat_completions_seed(self, client, model: str, messages: List[dict]): - responses = [] - payload1 = { - "model": model, - "messages": messages, - # Increase token length to allow more room for variability - "max_tokens": 200, - "seed": 1, - } - payload2 = copy.deepcopy(payload1) - payload2["seed"] = 2 - - # First 2 responses should be the same in both vLLM and TRT-LLM with identical seed - responses.append( - client.post( - "/v1/chat/completions", - json=payload1, - ) - ) - responses.append( - client.post( - "/v1/chat/completions", - json=payload1, - ) - ) - # Third response should differ with different seed in payload - responses.append( - client.post( - "/v1/chat/completions", - json=payload2, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = ( - responses[0].json()["choices"][0]["message"]["content"].strip().split() - ) - response2_text = ( - responses[1].json()["choices"][0]["message"]["content"].strip().split() - ) - response3_text = ( - responses[2].json()["choices"][0]["message"]["content"].strip().split() - ) - - assert response1_text == response2_text - assert response1_text != response3_text - - def test_chat_completions_no_message( - self, client, model: str, messages: List[dict] - ): - # Message validation requires min_length of 1 - messages = [] - response = client.post( - "/v1/chat/completions", json={"model": model, "messages": messages} - ) - assert response.status_code == 422 - assert ( - 
response.json()["detail"][0]["msg"] - == "List should have at least 1 item after validation, not 0" - ) - - def test_chat_completions_empty_message( - self, client, model: str, messages: List[dict] - ): - # Message validation requires min_length of 1 - messages = [{}] - response = client.post( - "/v1/chat/completions", json={"model": model, "messages": messages} - ) - assert response.status_code == 422 - assert response.json()["detail"][0]["msg"] == "Field required" - - def test_chat_completions_multiple_choices( - self, client, model: str, messages: List[dict] - ): - response = client.post( - "/v1/chat/completions", - json={"model": model, "messages": messages, "n": 2}, - ) - - assert response.status_code == 400 - assert response.json()["detail"] == "Only single choice is supported" - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_chat_completions_streaming(self, client): - pass - - def test_chat_completions_no_streaming( - self, client, model: str, messages: List[dict] - ): - response = client.post( - "/v1/chat/completions", - json={"model": model, "messages": messages, "stream": False}, - ) - - assert response.status_code == 200 - message = response.json()["choices"][0]["message"] - assert message["content"].strip() - assert message["role"] == "assistant" - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_function_calling(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_lora(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_multi_lora(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_request_n_choices(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_request_logprobs(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_request_logit_bias(self): - pass - - # TODO: Do we want to support "usage" field for token counts in response? - @pytest.mark.skip(reason="Not Implemented Yet") - def test_usage_response(self): - pass - - -# For tests that won't use the same pytest fixture for server startup across -# the whole class test suite. -class TestChatCompletionsCustomFixture: - # A TOKENIZER must be known for /chat/completions endpoint in order to - # apply chat templates, and for simplicity in determination, users should - # define the TOKENIZER. So, explicitly raise an error if none is provided. 
- def test_chat_completions_no_tokenizer( - self, backend: str, model: str, messages: List[dict] - ): - model_repository = str(Path(__file__).parent / f"{backend}_models") - app = setup_fastapi_app(model_repository=model_repository, tokenizer="") - with TestClient(app) as client: - response = client.post( - "/v1/chat/completions", - json={"model": model, "messages": messages}, - ) - assert response.status_code == 400 - assert response.json()["detail"] == "Unknown tokenizer" diff --git a/qa/L0_openai/openai/src/tests/test_completions.py b/qa/L0_openai/openai/src/tests/test_completions.py deleted file mode 100644 index e43e225988..0000000000 --- a/qa/L0_openai/openai/src/tests/test_completions.py +++ /dev/null @@ -1,321 +0,0 @@ -import copy - -import pytest - - -class TestCompletions: - @pytest.fixture(scope="class") - def client(self, fastapi_client_class_scope): - yield fastapi_client_class_scope - - def test_completions_defaults(self, client, model: str, prompt: str): - response = client.post( - "/v1/completions", - json={"model": model, "prompt": prompt}, - ) - - print("Response:", response.json()) - assert response.status_code == 200 - # NOTE: Could be improved to look for certain quality of response, - # or tested with dummy identity model. - assert response.json()["choices"][0]["text"].strip() - # "usage" currently not supported - assert response.json()["usage"] == None - - @pytest.mark.parametrize( - "sampling_parameter, value", - [ - ("temperature", 0.7), - ("max_tokens", 10), - ("top_p", 0.9), - ("frequency_penalty", 0.5), - ("presence_penalty", 0.2), - # logprobs is an integer for completions - ("logprobs", 5), - ("logit_bias", {"0": 0}), - ], - ) - def test_completions_sampling_parameters( - self, client, sampling_parameter, value, model: str, prompt: str - ): - response = client.post( - "/v1/completions", - json={ - "model": model, - "prompt": prompt, - sampling_parameter: value, - }, - ) - print("Response:", response.json()) - - # TODO: Add support and remove this check - unsupported_parameters = ["logprobs", "logit_bias"] - if sampling_parameter in unsupported_parameters: - assert response.status_code == 400 - assert response.json()["detail"] == "logit bias and log probs not supported" - return - - assert response.status_code == 200 - assert response.json()["choices"][0]["text"].strip() - - # Simple tests to verify max_tokens roughly behaves as expected - def test_completions_max_tokens(self, client, model: str, prompt: str): - responses = [] - payload = {"model": model, "prompt": prompt, "max_tokens": 1} - - # Send two requests with max_tokens = 1 to check their similarity - payload["max_tokens"] = 1 - responses.append( - client.post( - "/v1/completions", - json=payload, - ) - ) - responses.append( - client.post( - "/v1/completions", - json=payload, - ) - ) - # Send one requests with larger max_tokens to check its dis-similarity - payload["max_tokens"] = 100 - responses.append( - client.post( - "/v1/completions", - json=payload, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = responses[0].json()["choices"][0]["text"].strip().split() - response2_text = responses[1].json()["choices"][0]["text"].strip().split() - response3_text = responses[2].json()["choices"][0]["text"].strip().split() - # Simplification: One token shouldn't be more than one space-delimited word - assert len(response1_text) == len(response2_text) == 1 - assert len(response3_text) > len(response1_text) - - 
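    # NOTE: A rough sketch of the same kind of request against a running server,
    # assuming it is already listening locally on its default port 8000 and serving
    # the "llama-3.1-8b-instruct" vLLM test model used elsewhere in this suite
    # (the prompt string is only a placeholder):
    #
    #   import openai
    #
    #   client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    #   completion = client.completions.create(
    #       model="llama-3.1-8b-instruct",
    #       prompt="What is machine learning?",
    #       max_tokens=1,
    #   )
    #   print(completion.choices[0].text)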
@pytest.mark.parametrize( - "temperature", - [0.0, 1.0], - ) - # Simple tests to verify temperature roughly behaves as expected - def test_completions_temperature_vllm( - self, client, temperature, backend: str, model: str, prompt: str - ): - if backend != "vllm": - pytest.skip(reason="Only used to test vLLM-specific temperature behavior") - - responses = [] - payload = { - "model": model, - "prompt": prompt, - "temperature": temperature, - } - - responses.append( - client.post( - "/v1/completions", - json=payload, - ) - ) - responses.append( - client.post( - "/v1/completions", - json=payload, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = responses[0].json()["choices"][0]["text"].strip().split() - response2_text = responses[1].json()["choices"][0]["text"].strip().split() - - # Temperature of 0.0 indicates greedy sampling, so check - # that two equivalent requests produce the same response. - if temperature == 0.0: - # NOTE: This check may be ambitious to get an exact match in all - # frameworks depending on how other parameter defaults are set, so - # it can probably be removed if it introduces flakiness. - print(f"Comparing '{response1_text}' == '{response2_text}'") - assert response1_text == response2_text - # Temperature of 1.0 indicates maximum randomness, so check - # that two equivalent requests produce different responses. - elif temperature == 1.0: - print(f"Comparing '{response1_text}' != '{response2_text}'") - assert response1_text != response2_text - # Don't bother checking values other than the extremes - else: - raise ValueError(f"Unexpected {temperature=} for this test.") - - # Remove xfail when fix is released and this test returns xpass status - @pytest.mark.xfail( - reason="TRT-LLM BLS model will ignore temperature until a later release" - ) - # Simple tests to verify temperature roughly behaves as expected - def test_completions_temperature_tensorrtllm( - self, client, backend: str, model: str, prompt: str - ): - if backend != "tensorrtllm": - pytest.skip(reason="Only used to test vLLM-specific temperature behavior") - - responses = [] - payload1 = { - "model": model, - "prompt": prompt, - "temperature": 0.0, - # TRT-LLM requires certain settings of `top_k` / `top_p` to - # respect changes in `temperature` - "top_p": 0.5, - } - payload2 = copy.deepcopy(payload1) - payload2["temperature"] = 1.0 - - # First 2 responses should be the same in TRT-LLM with identical payload - responses.append( - client.post( - "/v1/completions", - json=payload1, - ) - ) - responses.append( - client.post( - "/v1/completions", - json=payload1, - ) - ) - # Third response should differ with different temperature in payload - responses.append( - client.post( - "/v1/completions", - json=payload2, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = responses[0].json()["choices"][0]["text"].strip().split() - response2_text = responses[1].json()["choices"][0]["text"].strip().split() - response3_text = responses[2].json()["choices"][0]["text"].strip().split() - - assert response1_text == response2_text - assert response1_text != response3_text - - # Simple tests to verify seed roughly behaves as expected - def test_completions_seed(self, client, model: str, prompt: str): - responses = [] - payload1 = {"model": model, "prompt": prompt, "seed": 1} - payload2 = copy.deepcopy(payload1) - payload2["seed"] = 2 - - # First 2 responses should 
be the same in TRT-LLM with identical payload - responses.append( - client.post( - "/v1/completions", - json=payload1, - ) - ) - responses.append( - client.post( - "/v1/completions", - json=payload1, - ) - ) - # Third response should differ with different temperature in payload - responses.append( - client.post( - "/v1/completions", - json=payload2, - ) - ) - - for response in responses: - print("Response:", response.json()) - assert response.status_code == 200 - - response1_text = responses[0].json()["choices"][0]["text"].strip().split() - response2_text = responses[1].json()["choices"][0]["text"].strip().split() - response3_text = responses[2].json()["choices"][0]["text"].strip().split() - - assert response1_text == response2_text - assert response1_text != response3_text - - @pytest.mark.parametrize( - "sampling_parameter, value", - [ - ("temperature", 2.1), - ("temperature", -0.1), - ("max_tokens", -1), - ("top_p", 1.1), - ("frequency_penalty", 3), - ("frequency_penalty", -3), - ("presence_penalty", 2.1), - ("presence_penalty", -2.1), - ], - ) - def test_completions_invalid_sampling_parameters( - self, client, sampling_parameter, value, model: str, prompt: str - ): - response = client.post( - "/v1/completions", - json={ - "model": model, - "prompt": prompt, - sampling_parameter: value, - }, - ) - - print("Response:", response.json()) - assert response.status_code == 422 - - def test_completions_empty_request(self, client): - response = client.post("/v1/completions", json={}) - assert response.status_code == 422 - - def test_completions_no_model(self, client, prompt: str): - response = client.post("/v1/completions", json={"prompt": prompt}) - assert response.status_code == 422 - - def test_completions_no_prompt(self, client, model: str): - response = client.post("/v1/completions", json={"model": model}) - assert response.status_code == 422 - - def test_completions_empty_prompt(self, client, model: str): - response = client.post("/v1/completions", json={"model": model, "prompt": ""}) - - # NOTE: Should this be validated in schema instead? - # 400 Error returned in route handler - assert response.status_code == 400 - - def test_no_prompt(self, client, model: str): - response = client.post("/v1/completions", json={"model": model}) - - # 422 Error returned by schema validation - assert response.status_code == 422 - - def test_completions_multiple_choices(self, client, model: str, prompt: str): - response = client.post( - "/v1/completions", json={"model": model, "prompt": prompt, "n": 2} - ) - - assert response.status_code == 400 - assert response.json()["detail"] == "Only single choice is supported" - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_lora(self): - pass - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_multi_lora(self): - pass - - # TODO: Do we want to support "usage" field for token counts in response? - @pytest.mark.skip(reason="Not Implemented Yet") - def test_usage_response(self): - pass diff --git a/qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py b/qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py deleted file mode 100644 index 1cf5f3613c..0000000000 --- a/qa/L0_openai/openai/src/tests/test_models/mock_llm/1/model.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import time - -import numpy as np -import triton_python_backend_utils as pb_utils - - -class TritonPythonModel: - def initialize(self, args): - self.model_config = json.loads(args["model_config"]) - self.decoupled = self.model_config.get("model_transaction_policy", {}).get( - "decoupled" - ) - - def execute(self, requests): - if self.decoupled: - return self.exec_decoupled(requests) - else: - return self.exec(requests) - - def exec(self, requests): - responses = [] - for request in requests: - params = json.loads(request.parameters()) - rep_count = params["REPETITION"] if "REPETITION" in params else 1 - - input_np = pb_utils.get_input_tensor_by_name( - request, "text_intpu" - ).as_numpy() - stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() - stream = stream_np.flatten()[0] - if stream: - responses.append( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - "STREAM only supported in decoupled mode" - ) - ) - ) - else: - out_tensor = pb_utils.Tensor( - "text_output", np.repeat(input_np, rep_count, axis=1) - ) - responses.append(pb_utils.InferenceResponse([out_tensor])) - return responses - - def exec_decoupled(self, requests): - for request in requests: - params = json.loads(request.parameters()) - rep_count = params["REPETITION"] if "REPETITION" in params else 1 - fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False - delay = params["DELAY"] if "DELAY" in params else None - - sender = request.get_response_sender() - input_np = pb_utils.get_input_tensor_by_name( - request, "text_input" - ).as_numpy() - stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() - out_tensor = pb_utils.Tensor("text_output", input_np) - response = pb_utils.InferenceResponse([out_tensor]) - # If stream enabled, just send multiple copies of response - # FIXME: Could split up response string into tokens, but this is simpler for now. 
- stream = stream_np.flatten()[0] - if stream: - for _ in range(rep_count): - if delay is not None: - time.sleep(delay) - sender.send(response) - sender.send( - None - if not fail_last - else pb_utils.InferenceResponse( - error=pb_utils.TritonError("An Error Occurred") - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - # If stream disabled, just send one response - else: - sender.send( - response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - return None diff --git a/qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt b/qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt deleted file mode 100644 index 5f665ff543..0000000000 --- a/qa/L0_openai/openai/src/tests/test_models/mock_llm/config.pbtxt +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-backend: "python" - -max_batch_size: 0 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1, 1 ] - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1, 1 ] - } -] - -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ 1, -1 ] - } -] - -instance_group [ - { - count: 1 - kind: KIND_MODEL - } -] diff --git a/qa/L0_openai/openai/src/tests/test_observability.py b/qa/L0_openai/openai/src/tests/test_observability.py deleted file mode 100644 index eca88a03de..0000000000 --- a/qa/L0_openai/openai/src/tests/test_observability.py +++ /dev/null @@ -1,71 +0,0 @@ -import os -from pathlib import Path - -import pytest -from fastapi.testclient import TestClient -from src.api_server import init_app - - -# Override conftest.py default model -@pytest.fixture -def model(): - return "mock_llm" - - -class TestObservability: - @pytest.fixture(scope="class") - def client(self): - model_repository = Path(__file__).parent / "test_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - app = init_app() - with TestClient(app) as test_client: - yield test_client - - ### General Error Handling ### - def test_not_found(self, client): - response = client.get("/does-not-exist") - assert response.status_code == 404 - - ### Startup / Health ### - def test_startup_success(self, client): - response = client.get("/health") - assert response.status_code == 200 - - def test_startup_fail(self): - os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" - with pytest.raises(Exception): - # Test that FastAPI lifespan startup fails when initializing Triton - # with unknown model repository. - app = init_app() - with TestClient(app): - pass - - ### Metrics ### - def test_startup_metrics(self, client): - response = client.get("/metrics") - assert response.status_code == 200 - # FIXME: Flesh out more - # NOTE: response.json() works even on non-json prometheus data - assert "nv_cpu_utilization" in response.json() - - ### Models ### - def test_models_list(self, client, model): - # TODO: Load multiple models and make sure exactly ALL are returned - response = client.get("/v1/models") - assert response.status_code == 200 - models = response.json()["data"] - assert len(models) == 1 - assert models[0]["id"] == model - assert models[0]["object"] == "model" - assert models[0]["created"] > 0 - assert models[0]["owned_by"] == "Triton Inference Server" - - def test_models_get(self, client, model): - # TODO: Load multiple models and make sure exactly 1 is returned - response = client.get(f"/v1/models/{model}") - assert response.status_code == 200 - model_resp = response.json() - assert model_resp["id"] == model - assert model_resp["object"] == "model" - assert model_resp["created"] > 0 - assert model_resp["owned_by"] == "Triton Inference Server" diff --git a/qa/L0_openai/openai/src/tests/test_openai_client.py b/qa/L0_openai/openai/src/tests/test_openai_client.py deleted file mode 100644 index 6c61403e73..0000000000 --- a/qa/L0_openai/openai/src/tests/test_openai_client.py +++ /dev/null @@ -1,163 +0,0 @@ -from typing import List - -import openai -import pytest - - -class TestOpenAIClient: - @pytest.fixture(scope="class") - def client(self, server): - return server.get_client() - - def test_openai_client_models(self, client: openai.OpenAI, backend: str): - models = list(client.models.list()) - print(f"Models: {models}") - if backend == "tensorrtllm": - # ensemble or tensorrt_llm_bls - # preprocess -> tensorrt_llm -> postprocess - assert 
len(models) == 5 - elif backend == "vllm": - assert len(models) == 1 - else: - raise Exception(f"Unexpected backend {backend=}") - - def test_openai_client_completion( - self, client: openai.OpenAI, model: str, prompt: str - ): - completion = client.completions.create( - prompt=prompt, - model=model, - ) - - print(f"Completion results: {completion}") - assert completion.choices[0].text - assert completion.choices[0].finish_reason == "stop" - - def test_openai_client_chat_completion( - self, client: openai.OpenAI, model: str, messages: List[dict] - ): - chat_completion = client.chat.completions.create( - messages=messages, - model=model, - ) - - print(f"Chat completion results: {chat_completion}") - assert chat_completion.choices[0].message.content - assert chat_completion.choices[0].finish_reason == "stop" - - @pytest.mark.parametrize("echo", [False, True]) - def test_openai_client_completion_echo( - self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str - ): - if backend == "tensorrtllm": - pytest.skip( - reason="TRT-LLM backend currently only supports setting this parameter at model load time", - ) - - completion = client.completions.create(prompt=prompt, model=model, echo=echo) - - print(f"Completion results: {completion}") - response = completion.choices[0].text - if echo: - assert prompt in response - else: - assert prompt not in response - - @pytest.mark.skip(reason="Not Implemented Yet") - def test_openai_client_function_calling(self): - pass - - -class TestAsyncOpenAIClient: - @pytest.fixture(scope="class") - def client(self, server): - return server.get_async_client() - - @pytest.mark.asyncio - async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str): - async_models = await client.models.list() - models = [model async for model in async_models] - print(f"Models: {models}") - if backend == "tensorrtllm": - # ensemble or tensorrt_llm_bls - # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 5 - elif backend == "vllm": - assert len(models) == 1 - else: - raise Exception(f"Unexpected backend {backend=}") - - @pytest.mark.asyncio - async def test_openai_client_completion( - self, client: openai.AsyncOpenAI, model: str, prompt: str - ): - completion = await client.completions.create( - prompt=prompt, - model=model, - ) - - print(f"Completion results: {completion}") - assert completion.choices[0].text - assert completion.choices[0].finish_reason == "stop" - - @pytest.mark.asyncio - async def test_openai_client_chat_completion( - self, client: openai.AsyncOpenAI, model: str, messages: List[dict] - ): - chat_completion = await client.chat.completions.create( - messages=messages, - model=model, - ) - - assert chat_completion.choices[0].message.content - assert chat_completion.choices[0].finish_reason == "stop" - print(f"Chat completion results: {chat_completion}") - - # TODO: Add this test - @pytest.mark.skip(reason="Not Implemented Yet") - @pytest.mark.asyncio - async def test_completion_streaming(self): - pass - - @pytest.mark.asyncio - async def test_chat_streaming( - self, client: openai.AsyncOpenAI, model: str, messages: List[dict] - ): - # test single completion - chat_completion = await client.chat.completions.create( - model=model, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=False, - ) - output = chat_completion.choices[0].message.content - stop_reason = chat_completion.choices[0].finish_reason - - # test streaming - stream = await client.chat.completions.create( - model=model, - 
messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - ) - chunks = [] - finish_reason_count = 0 - async for chunk in stream: - delta = chunk.choices[0].delta - if delta.role: - assert delta.role == "assistant" - if delta.content: - chunks.append(delta.content) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == stop_reason - assert "".join(chunks) == output - - @pytest.mark.skip(reason="Not Implemented Yet") - @pytest.mark.asyncio - async def test_openai_client_function_calling(self): - pass diff --git a/qa/L0_openai/openai/src/tests/utils.py b/qa/L0_openai/openai/src/tests/utils.py deleted file mode 100644 index d03368663a..0000000000 --- a/qa/L0_openai/openai/src/tests/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import subprocess -import sys -import time -from pathlib import Path -from typing import Dict, List, Optional - -import openai -import requests -from src.api_server import init_app - - -def setup_fastapi_app(tokenizer: str, model_repository: str): - os.environ["TOKENIZER"] = tokenizer - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - app = init_app() - return app - - -# Heavily inspired by vLLM's test infrastructure -class OpenAIServer: - API_KEY = "EMPTY" # Triton's OpenAI server does not need API key - START_TIMEOUT = 120 # wait for server to start for up to 120 seconds - - def __init__( - self, - cli_args: List[str], - *, - env_dict: Optional[Dict[str, str]] = None, - ) -> None: - self.host = "localhost" - self.port = 8000 - - env = os.environ.copy() - if env_dict is not None: - env.update(env_dict) - - this_dir = Path(__file__).resolve().parent - script_path = this_dir / ".." / ".." 
/ "main.py" - self.proc = subprocess.Popen( - ["python3", script_path] + cli_args, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - # Wait until health endpoint is responsive - self._wait_for_server(url=self.url_for("health"), timeout=self.START_TIMEOUT) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.proc.terminate() - try: - wait_secs = 30 - self.proc.wait(wait_secs) - except subprocess.TimeoutExpired: - # force kill if needed - self.proc.kill() - - def _wait_for_server(self, *, url: str, timeout: float): - start = time.time() - while True: - try: - if requests.get(url).status_code == 200: - break - except Exception as err: - result = self.proc.poll() - if result is not None and result != 0: - raise RuntimeError("Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > timeout: - raise RuntimeError("Server failed to start in time.") from err - - @property - def url_root(self) -> str: - return f"http://{self.host}:{self.port}" - - def url_for(self, *parts: str) -> str: - return self.url_root + "/" + "/".join(parts) - - def get_client(self): - return openai.OpenAI( - base_url=self.url_for("v1"), - api_key=self.API_KEY, - ) - - def get_async_client(self): - return openai.AsyncOpenAI( - base_url=self.url_for("v1"), - api_key=self.API_KEY, - ) diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json deleted file mode 100644 index 00f18b88bd..0000000000 --- a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/1/model.json +++ /dev/null @@ -1 +0,0 @@ -{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file diff --git a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt b/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt deleted file mode 100644 index 4ad6534943..0000000000 --- a/qa/L0_openai/openai/src/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt +++ /dev/null @@ -1,2 +0,0 @@ -backend: "vllm" -instance_group [{kind: KIND_MODEL}] \ No newline at end of file diff --git a/qa/L0_openai/openai/src/utils/__init__.py b/qa/L0_openai/openai/src/utils/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qa/L0_openai/openai/src/utils/tokenizer.py b/qa/L0_openai/openai/src/utils/tokenizer.py deleted file mode 100644 index a60783a5f9..0000000000 --- a/qa/L0_openai/openai/src/utils/tokenizer.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Optional, Union - -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast - - -def get_cached_tokenizer( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - """Get tokenizer with cached properties. - - This will patch the tokenizer object in place. - - By default, transformers will recompute multiple tokenizer properties - each time they are called, leading to a significant slowdown. 
This - function caches these properties for faster access.""" - - tokenizer_all_special_ids = set(tokenizer.all_special_ids) - tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended - tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) - tokenizer_len = len(tokenizer) - - class CachedTokenizer(tokenizer.__class__): # type: ignore - @property - def all_special_ids(self): - return tokenizer_all_special_ids - - @property - def all_special_tokens(self): - return tokenizer_all_special_tokens - - @property - def all_special_tokens_extended(self): - return tokenizer_all_special_tokens_extended - - def __len__(self): - return tokenizer_len - - CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" - - tokenizer.__class__ = CachedTokenizer - return tokenizer - - -def get_tokenizer( - tokenizer_name: str, - *args, - tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - tokenizer_revision: Optional[str] = None, - download_dir: Optional[str] = None, - **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: - """Gets a tokenizer for the given model name via Huggingface/modelscope.""" - if tokenizer_mode == "slow": - if kwargs.get("use_fast", False): - raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") - kwargs["use_fast"] = False - - try: - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - tokenizer_revision=tokenizer_revision, - **kwargs, - ) - except ValueError as e: - raise e - except AttributeError as e: - raise e - - if not isinstance(tokenizer, PreTrainedTokenizerFast): - print( - "Using a slow tokenizer. This might cause a significant " - "slowdown. Consider using a fast tokenizer instead." - ) - return get_cached_tokenizer(tokenizer) diff --git a/qa/L0_openai/openai/src/utils/triton.py b/qa/L0_openai/openai/src/utils/triton.py deleted file mode 100644 index 42a92fa34d..0000000000 --- a/qa/L0_openai/openai/src/utils/triton.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import time -import typing -from dataclasses import dataclass - -import numpy as np -import tritonserver -from fastapi import HTTPException -from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest -from src.utils.tokenizer import get_tokenizer - -# TODO: Refactor -# NOTE: Allow python backend for testing purposes -SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm", "python"} -LLM_BACKENDS: set = {"vllm", "tensorrtllm"} - - -# TODO: pydantic validation? -@dataclass -class TritonModelMetadata: - # Name used in Triton model repository - name: str - # Name of backend used by Triton - backend: str - # Triton model object handle - model: tritonserver.Model - # TODO: Address typing - tokenizer: typing.Optional[typing.Any] - # Time that model was loaded by Triton - create_time: int - # TODO: Address typing - request_convert_fn: typing.Optional[typing.Any] - - -def determine_request_format(backend): - # Request conversion from OpenAI format to backend-specific format - if backend == "vllm": - request_convert_fn = create_vllm_inference_request - # Python included to support TRT-LLM BLS model and TRT-LLM python runtime - elif backend in ["tensorrtllm", "python"]: - request_convert_fn = create_trtllm_inference_request - else: - request_convert_fn = None - - return request_convert_fn - - -# TODO: Refactor: -# NOTE: We need to figure out a few things while looking at the models in the -# triton model repository. -# 1. 
Which model should we interact with when sending requests to Triton core? -# a. For a single model, this is trivial, and would support any backend. -# b. For TRT-LLM, this should be 'ensemble' or 'tensorrt_llm_bls' following -# TRT-LLM defaults/examples. However, this could also be renamed by the user -# to have a more intuitive front-facing name, such as "llama3-8b". Note that -# TRT-LLM pipelines produced by the Triton CLI will generally be renamed like -# this. FIXME: This is a relatively fragile flow and should be improved. -# 2. Which tokenizer to use for things like applying a chat template or making -# a tool/function call. These are primarily relevant for the /chat/completions -# endpoint, but not the /completions endpoint. -# - For now, require user-defined TOKENIZER for simplicity. -# 3. Which inputs/outputs/parameters should be set when creating the underlying -# triton inference request? The inference request fields required will differ -# for vLLM, TRT-LLM, and user-defined models like a custom python model. So we -# need to know how to correctly translate the OpenAI schema parameters to -# a triton inference request. -# - For now, we will look for either vllm or trtllm in list of loaded backends, -# and we consider python==trtllm for now due to possibility of python runtime. -# We may want to consider using Triton's "runtime" config field for this for -# easier detection instead. -def load_models(server): - model_metadatas = [] - backends = [] - - # TODO: Support tokenizers more generically or custom tokenizers, possibly - # by looking for tokenizer.json in a pre-specified location? - tokenizer = None - tokenizer_model = os.environ.get("TOKENIZER") - if tokenizer_model: - print(f"Using env var TOKENIZER={tokenizer_model} to determine the tokenizer") - tokenizer = get_tokenizer(tokenizer_model) - - models = [] - backends = [] - names = [] - # Load all triton models and gather the respective backends of each - for name, version in server.models().keys(): - # TODO: Why skip known version? Already loaded? - if version != -1: - continue - - model = server.load(name) - backend = model.config()["backend"] - - names.append(name) - models.append(model) - backends.append(backend) - print(f"Loaded: {name=}, {backend=}, tokenizer={tokenizer_model}") - - create_time = int(time.time()) - - # One tokenizer, convert function, and creation time for all loaded models. - # NOTE: This doesn't currently support having both a vLLM and TRT-LLM - # model loaded at the same time. 
- for name, model, backend in zip(names, models, backends): - metadata = TritonModelMetadata( - name=name, - backend=backend, - model=model, - tokenizer=tokenizer, - create_time=create_time, - request_convert_fn=determine_request_format(backend), - ) - model_metadatas.append(metadata) - - return model_metadatas - - -def init_tritonserver(): - model_repository = os.environ.get( - "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" - ) - log_verbose_level = int(os.environ.get("TRITON_LOG_VERBOSE_LEVEL", "0")) - - print("Starting Triton Server Core...") - server = tritonserver.Server( - model_repository=model_repository, - log_verbose=log_verbose_level, - log_info=True, - log_warn=True, - log_error=True, - model_control_mode=tritonserver.ModelControlMode.EXPLICIT, - ).start(wait_until_ready=True) - - print("Loading Models...") - metadatas = load_models(server) - return server, metadatas - - -def get_output(response): - if "text_output" in response.outputs: - try: - return response.outputs["text_output"].to_string_array()[0] - except: - return str(response.outputs["text_output"].to_bytes_array()[0]) - return "" - - -def validate_triton_responses(responses): - num_responses = len(responses) - if num_responses == 1 and responses[0].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses == 2 and responses[-1].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses > 2: - raise HTTPException( - status_code=400, - detail=f"Unexpected number of responses: {num_responses}, expected 1.", - ) - - -def create_vllm_inference_request( - model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest -): - inputs = {} - excludes = {"model", "stream", "messages", "prompt", "echo"} - - # NOTE: The exclude_none is important, as internals may not support - # values of NoneType at this time. - sampling_parameters = request.model_dump( - exclude=excludes, - exclude_none=True, - ) - print(f"[DEBUG] {sampling_parameters=}") - - inputs["text_input"] = [prompt] - inputs["stream"] = [request.stream] - exclude_input_in_output = True - echo = getattr(request, "echo", None) - if echo: - exclude_input_in_output = not echo - inputs["exclude_input_in_output"] = [exclude_input_in_output] - - print(f"[DEBUG] Triton Inference Request {inputs=}") - return model.create_request(inputs=inputs, parameters=sampling_parameters) - - -def create_trtllm_inference_request( - model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest -): - inputs = {} - inputs["text_input"] = [[prompt]] - inputs["stream"] = [[request.stream]] - if request.max_tokens: - inputs["max_tokens"] = np.int32([[request.max_tokens]]) - if request.stop: - if isinstance(request.stop, str): - request.stop = [request.stop] - inputs["stop_words"] = [request.stop] - # Check "is not None" specifically, because values of zero are valid. 
- if request.top_p is not None: - inputs["top_p"] = np.float32([[request.top_p]]) - if request.frequency_penalty is not None: - inputs["frequency_penalty"] = np.float32([[request.frequency_penalty]]) - if request.presence_penalty is not None: - inputs["presence_penalty"] = np.float32([[request.presence_penalty]]) - if request.seed is not None: - inputs["random_seed"] = np.uint64([[request.seed]]) - if request.temperature is not None: - inputs["temperature"] = np.float32([[request.temperature]]) - - print(f"[DEBUG] Triton Inference Request {inputs=}") - return model.create_request(inputs=inputs) From d35d336c9439ec5c768860539a10199543b4703f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:06:44 -0700 Subject: [PATCH 28/80] Add disclaimer for TRT-LLM to README --- python/openai/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/openai/README.md b/python/openai/README.md index c8baf0a8e8..99c730a403 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -103,6 +103,12 @@ pytest -v ## TensorRT-LLM +**NOTE**: The workflow for preparing TRT-LLM engines, model repository, etc. in order to +load and test is not fleshed out in the README here yet. You can try using the Triton CLI +or follow existing TRT-LLM backend examples to prepare a model repository, and point +at the model repository accordingly when following the examples. This should be fleshed out + or cleaned up in the future. + 0. `[TODO]` Prepare your model repository for a TensorRT-LLM model, build the engine, etc. 1. Build and launch the container: From 63fc4a7e83272c7b370ae9ef20194947483eb962 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:10:21 -0700 Subject: [PATCH 29/80] Fix README typos --- python/openai/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/openai/README.md b/python/openai/README.md index 99c730a403..13a5e7a73b 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -18,6 +18,7 @@ ```bash docker build -t tritonserver-openai-vllm -f docker/Dockerfile.vllm . + docker run -it --net=host --gpus all --rm \ -v ${PWD}:/workspace \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ @@ -94,7 +95,7 @@ print(completion.choices[0].message.content) ``` 7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): -``` +```bash cd openai/tests/ pytest -v ``` @@ -118,7 +119,8 @@ at the model repository accordingly when following the examples. This should be access gated models, make sure this is set in your local environment if needed. ```bash -docker build -t tritonserver-openai-vllm -f docker/Dockerfile.tensorttllm . +docker build -t tritonserver-openai-vllm -f docker/Dockerfile.tensorrtllm . 
+ docker run -it --net=host --gpus all --rm \ -v ${PWD}:/workspace \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ From 4a729c0e1beb8bd4978d93da48d6a23ccd5371a6 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:14:29 -0700 Subject: [PATCH 30/80] Fix relative path for OpenAI server helper after moving locations --- python/openai/openai/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/openai/openai/tests/utils.py b/python/openai/openai/tests/utils.py index d03368663a..49136cfd5a 100644 --- a/python/openai/openai/tests/utils.py +++ b/python/openai/openai/tests/utils.py @@ -36,7 +36,7 @@ def __init__( env.update(env_dict) this_dir = Path(__file__).resolve().parent - script_path = this_dir / ".." / ".." / "main.py" + script_path = this_dir / ".." / "main.py" self.proc = subprocess.Popen( ["python3", script_path] + cli_args, env=env, From 0f459b11c4329062603e94865770ec7da8f97bde Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 16:24:48 -0700 Subject: [PATCH 31/80] Add placeholder L0_openai test folder back --- qa/L0_openai/test.sh | 3 +++ 1 file changed, 3 insertions(+) create mode 100755 qa/L0_openai/test.sh diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh new file mode 100755 index 0000000000..834d18691e --- /dev/null +++ b/qa/L0_openai/test.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "This is a placeholder" From 0b3def0af8656595a13c0be6c36e227c012d598e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 19 Aug 2024 20:41:27 -0700 Subject: [PATCH 32/80] Add transformers upgrade for Llama3.1 in vllm --- python/openai/docker/Dockerfile.vllm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index dbb8a5f63d..fcfa872539 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -3,4 +3,4 @@ FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl # NOTE: Newer vllm version upgrade to support Llama3.1 in 24.07 container. # This should be unnecessary in 24.08 container. 
-RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" +RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" "transformers==4.43.1" From 2e897b9b1e4d5587af4eb2445f3a8f6975d3a7f6 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 16:23:29 -0700 Subject: [PATCH 33/80] Add requirements.txt files for use in testing --- python/openai/docker/Dockerfile.tensorrtllm | 5 ++++- python/openai/docker/Dockerfile.vllm | 8 +++++--- python/openai/docker/requirements.txt | 7 +++++++ python/openai/docker/requirements_vllm.txt | 3 +++ 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 python/openai/docker/requirements.txt create mode 100644 python/openai/docker/requirements_vllm.txt diff --git a/python/openai/docker/Dockerfile.tensorrtllm b/python/openai/docker/Dockerfile.tensorrtllm index 1128cc4355..922c38a18f 100644 --- a/python/openai/docker/Dockerfile.tensorrtllm +++ b/python/openai/docker/Dockerfile.tensorrtllm @@ -1,4 +1,7 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 FROM ${BASE_IMAGE} + RUN pip install /opt/tritonserver/python/*.whl -RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" + +COPY requirements.txt /tmp +RUN pip install -r /tmp/requirements.txt diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index fcfa872539..15c6c7a122 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -1,6 +1,8 @@ ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 FROM ${BASE_IMAGE} + RUN pip install /opt/tritonserver/python/*.whl -# NOTE: Newer vllm version upgrade to support Llama3.1 in 24.07 container. -# This should be unnecessary in 24.08 container. 
-RUN pip install "fastapi==0.111.1" "pytest==8.1.1" "openai==1.40.6" "pytest-asyncio==0.23.8" "vllm==0.5.3.post1" "transformers==4.43.1" + +COPY requirements.txt requirements_vllm.txt /tmp +RUN pip install -r /tmp/requirements.txt && \ + pip install -r /tmp/requirements_vllm.txt diff --git a/python/openai/docker/requirements.txt b/python/openai/docker/requirements.txt new file mode 100644 index 0000000000..24849af73a --- /dev/null +++ b/python/openai/docker/requirements.txt @@ -0,0 +1,7 @@ +# FastAPI Application +fastapi==0.111.1 +openai==1.40.6 + +# Testing +pytest==8.1.1 +pytest-asyncio==0.23.8 diff --git a/python/openai/docker/requirements_vllm.txt b/python/openai/docker/requirements_vllm.txt new file mode 100644 index 0000000000..32bb38b789 --- /dev/null +++ b/python/openai/docker/requirements_vllm.txt @@ -0,0 +1,3 @@ +transformers==4.43.1 +# Llama3.1 vllm requirements +vllm==0.5.3.post1 From f54a4faf197240e089f59277546bab217b8d106f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 16:25:21 -0700 Subject: [PATCH 34/80] Add placeholder test script --- qa/L0_openai/test.sh | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 834d18691e..5f34aea1e8 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -1,3 +1,42 @@ #!/bin/bash -echo "This is a placeholder" +### Helpers ### + + +# TODO: Detect vllm vs trtllm +function install_deps() { + pushd openai/docker + pip install -r requirements.txt + pip install -r requirements_vllm.txt + popd +} + +function pre_test() { + + rm -rf openai/ + rm -f *.xml *.log + + # TODO: Use this instead when moving to devel container + # cp -r ../../python/openai . + cp -r /mnt/server/python/openai . + + install_deps +} + +function run_test() { + pushd openai/tests + pytest -s -v --junitxml=test_openai.xml 2>&1 | tee test_openai.log + cp *.xml *.log ../../ + popd +} + +function post_test() { + # no-op +} + + +### Test ### + +pre_test +run_test +post_test From c2786b2aa055b8a7ac65c31123d1159603da7471 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 17:55:50 -0700 Subject: [PATCH 35/80] Cleanup test script for local file reference --- qa/L0_openai/test.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 5f34aea1e8..a746e04821 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -12,14 +12,12 @@ function install_deps() { } function pre_test() { - + # Cleanup rm -rf openai/ rm -f *.xml *.log - # TODO: Use this instead when moving to devel container - # cp -r ../../python/openai . - cp -r /mnt/server/python/openai . - + # Prep test environment + cp -r ../../python/openai . 
install_deps } From 021c577db3152359740dca27d58c14a188b13dbc Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 21:56:11 -0700 Subject: [PATCH 36/80] Fix paths and empty function --- qa/L0_openai/test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index a746e04821..ab2ae8a8d7 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -22,14 +22,15 @@ function pre_test() { } function run_test() { - pushd openai/tests + pushd openai/openai/tests pytest -s -v --junitxml=test_openai.xml 2>&1 | tee test_openai.log - cp *.xml *.log ../../ + cp *.xml *.log ../../../ popd } function post_test() { - # no-op + # Placeholder + echo "post_test" } From a69bfd15fc50e7801f909907bef61f60138bef46 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 22:00:45 -0700 Subject: [PATCH 37/80] Install tritonserver python wheel --- qa/L0_openai/test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index ab2ae8a8d7..c6349507dd 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -6,6 +6,7 @@ # TODO: Detect vllm vs trtllm function install_deps() { pushd openai/docker + pip install /opt/tritonserver/python/triton*.whl pip install -r requirements.txt pip install -r requirements_vllm.txt popd From 6361bd1cd0a87b8587c238a7f15d27f279bbec38 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 22:57:01 -0700 Subject: [PATCH 38/80] Add TRT-LLM detection and model repo generation --- qa/L0_openai/test.sh | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index c6349507dd..33ada18a29 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -2,16 +2,49 @@ ### Helpers ### +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} +# NOTE: This default path doesn't make sense if run on non-A100 GPU +ENGINE_DEST_PATH=${ENGINE_DEST_PATH:="${DATADIR}/trtllm_engines_A100"} -# TODO: Detect vllm vs trtllm function install_deps() { pushd openai/docker pip install /opt/tritonserver/python/triton*.whl pip install -r requirements.txt - pip install -r requirements_vllm.txt + if [ "${IMAGE_KIND}" == "TRTLLM" ]; then + prepare_tensorrtllm + else + prepare_vllm + fi popd } +function prepare_vllm() { + pip install -r requirements_vllm.txt +} + +function prepare_tensorrtllm() { + # Use Triton CLI to prepare model repository for testing + pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10 + MODEL_REPO="../openai/tests/tensorrtllm_models" + rm -rf ${MODEL_REPO} + # Use ENGINE_DEST_PATH to re-use NFS mount when possible + ENGINE_DEST_PATH="${ENGINE_DEST_PATH}" triton import \ + --model llama-3-8b-instruct \ + --backend tensorrtllm \ + --model-repository "${MODEL_REPO}" + # WAR for tests expecting default name of "tensorrt_llm_bls" + mv "${MODEL_REPO}/llama-3-8b-instruct" "${MODEL_REPO}/tensorrt_llm_bls" +} + function pre_test() { # Cleanup rm -rf openai/ From c096ba5b8a661308f02abf299eb213a4b321e4d6 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 23:26:17 -0700 Subject: [PATCH 39/80] Fix trtllm model count comparison to 4, excluding ensemble --- 
python/openai/openai/tests/test_openai_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/openai/openai/tests/test_openai_client.py b/python/openai/openai/tests/test_openai_client.py index 6c61403e73..7a2a08fe19 100644 --- a/python/openai/openai/tests/test_openai_client.py +++ b/python/openai/openai/tests/test_openai_client.py @@ -13,9 +13,9 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str): models = list(client.models.list()) print(f"Models: {models}") if backend == "tensorrtllm": - # ensemble or tensorrt_llm_bls + # tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 5 + assert len(models) == 4 elif backend == "vllm": assert len(models) == 1 else: @@ -79,9 +79,9 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s models = [model async for model in async_models] print(f"Models: {models}") if backend == "tensorrtllm": - # ensemble or tensorrt_llm_bls + # tensorrt_llm_bls + # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 5 + assert len(models) == 4 elif backend == "vllm": assert len(models) == 1 else: From 563123129446c2a47783f4ffefb1f35241fbbe55 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 20 Aug 2024 23:33:31 -0700 Subject: [PATCH 40/80] Fail on pytest errors --- qa/L0_openai/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 33ada18a29..2c26072bfc 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -57,7 +57,9 @@ function pre_test() { function run_test() { pushd openai/openai/tests + set +e pytest -s -v --junitxml=test_openai.xml 2>&1 | tee test_openai.log + set -e cp *.xml *.log ../../../ popd } From e77f85cc3596c7dc0acb104bb69a4c6c58fc6a24 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 10:58:55 -0700 Subject: [PATCH 41/80] Try copying engines out of NFS mount for faster test I/O --- qa/L0_openai/test.sh | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 2c26072bfc..3d609762e7 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -11,9 +11,6 @@ if [ -z "$REPO_VERSION" ]; then echo -e "\n***\n*** Test Failed\n***" exit 1 fi -DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} -# NOTE: This default path doesn't make sense if run on non-A100 GPU -ENGINE_DEST_PATH=${ENGINE_DEST_PATH:="${DATADIR}/trtllm_engines_A100"} function install_deps() { pushd openai/docker @@ -32,15 +29,32 @@ function prepare_vllm() { } function prepare_tensorrtllm() { + DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} + # NOTE: This default path doesn't make sense if run on non-A100 GPU + NFS_ENGINE_DEST_PATH=${ENGINE_DEST_PATH:="${DATADIR}/trtllm_engines_A100"} + LOCAL_ENGINE_DEST_PATH="./trtllm_engines_A100" + + MODEL="llama-3-8b-instruct" + MODEL_REPO="../openai/tests/tensorrtllm_models" + rm -rf ${MODEL_REPO} ${LOCAL_ENGINE_DEST_PATH} + + # FIXME: This will require an upgrade each release to match the TRT-LLM version # Use Triton CLI to prepare model repository for testing pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10 - MODEL_REPO="../openai/tests/tensorrtllm_models" - rm -rf ${MODEL_REPO} - # Use ENGINE_DEST_PATH to re-use NFS mount when possible - ENGINE_DEST_PATH="${ENGINE_DEST_PATH}" triton import \ - --model llama-3-8b-instruct \ + # Use ENGINE_DEST_PATH to re-use NFS mount when possible and skip 
engine build + ENGINE_DEST_PATH="${NFS_ENGINE_DEST_PATH}" triton import \ + --model {$MODEL} \ --backend tensorrtllm \ --model-repository "${MODEL_REPO}" + rm -rf "${MODEL_REPO}" + + # To avoid too much I/O with NFS mount at test time, copy it out to a local dir first. + cp -r ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} + ENGINE_DEST_PATH="${LOCAL_ENGINE_DEST_PATH}" triton import \ + --model {$MODEL} \ + --backend tensorrtllm \ + --model-repository "${MODEL_REPO}" + # WAR for tests expecting default name of "tensorrt_llm_bls" mv "${MODEL_REPO}/llama-3-8b-instruct" "${MODEL_REPO}/tensorrt_llm_bls" } From b41a6f77485034a72529e950b9691e2d62f08983 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 12:12:14 -0700 Subject: [PATCH 42/80] Use model var --- qa/L0_openai/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 3d609762e7..325b0488aa 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -56,7 +56,7 @@ function prepare_tensorrtllm() { --model-repository "${MODEL_REPO}" # WAR for tests expecting default name of "tensorrt_llm_bls" - mv "${MODEL_REPO}/llama-3-8b-instruct" "${MODEL_REPO}/tensorrt_llm_bls" + mv "${MODEL_REPO}/${MODEL}" "${MODEL_REPO}/tensorrt_llm_bls" } function pre_test() { From 82519237618776b89d98d27db5ac281f81aecb6f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 12:16:45 -0700 Subject: [PATCH 43/80] Time the duration of copying from nfs mount --- qa/L0_openai/test.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 325b0488aa..6088acbac2 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -43,15 +43,15 @@ function prepare_tensorrtllm() { pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10 # Use ENGINE_DEST_PATH to re-use NFS mount when possible and skip engine build ENGINE_DEST_PATH="${NFS_ENGINE_DEST_PATH}" triton import \ - --model {$MODEL} \ + --model ${MODEL} \ --backend tensorrtllm \ --model-repository "${MODEL_REPO}" rm -rf "${MODEL_REPO}" # To avoid too much I/O with NFS mount at test time, copy it out to a local dir first. - cp -r ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} + time cp -r ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} ENGINE_DEST_PATH="${LOCAL_ENGINE_DEST_PATH}" triton import \ - --model {$MODEL} \ + --model ${MODEL} \ --backend tensorrtllm \ --model-repository "${MODEL_REPO}" @@ -71,9 +71,7 @@ function pre_test() { function run_test() { pushd openai/openai/tests - set +e pytest -s -v --junitxml=test_openai.xml 2>&1 | tee test_openai.log - set -e cp *.xml *.log ../../../ popd } @@ -86,6 +84,8 @@ function post_test() { ### Test ### +set +e pre_test run_test post_test +set -e From f928a810b3cca2703d96634a9cf3ac25194aa44e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 12:30:44 -0700 Subject: [PATCH 44/80] Try rsync over cp --- qa/L0_openai/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 6088acbac2..23ae622e4a 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -49,7 +49,7 @@ function prepare_tensorrtllm() { rm -rf "${MODEL_REPO}" # To avoid too much I/O with NFS mount at test time, copy it out to a local dir first. 
- time cp -r ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} + time rsync -ah ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} ENGINE_DEST_PATH="${LOCAL_ENGINE_DEST_PATH}" triton import \ --model ${MODEL} \ --backend tensorrtllm \ From 81ef479164f6d2f5e2fb1c131312fa991f499b95 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 14:46:20 -0700 Subject: [PATCH 45/80] Remove use of NFS mount due to slow I/O for now --- qa/L0_openai/test.sh | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 23ae622e4a..96e30cda87 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -2,16 +2,6 @@ ### Helpers ### -REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} -if [ "$#" -ge 1 ]; then - REPO_VERSION=$1 -fi -if [ -z "$REPO_VERSION" ]; then - echo -e "Repository version must be specified" - echo -e "\n***\n*** Test Failed\n***" - exit 1 -fi - function install_deps() { pushd openai/docker pip install /opt/tritonserver/python/triton*.whl @@ -29,28 +19,15 @@ function prepare_vllm() { } function prepare_tensorrtllm() { - DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"} - # NOTE: This default path doesn't make sense if run on non-A100 GPU - NFS_ENGINE_DEST_PATH=${ENGINE_DEST_PATH:="${DATADIR}/trtllm_engines_A100"} - LOCAL_ENGINE_DEST_PATH="./trtllm_engines_A100" - MODEL="llama-3-8b-instruct" MODEL_REPO="../openai/tests/tensorrtllm_models" - rm -rf ${MODEL_REPO} ${LOCAL_ENGINE_DEST_PATH} + rm -rf ${MODEL_REPO} # FIXME: This will require an upgrade each release to match the TRT-LLM version # Use Triton CLI to prepare model repository for testing pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.10 - # Use ENGINE_DEST_PATH to re-use NFS mount when possible and skip engine build - ENGINE_DEST_PATH="${NFS_ENGINE_DEST_PATH}" triton import \ - --model ${MODEL} \ - --backend tensorrtllm \ - --model-repository "${MODEL_REPO}" - rm -rf "${MODEL_REPO}" - - # To avoid too much I/O with NFS mount at test time, copy it out to a local dir first. - time rsync -ah ${NFS_ENGINE_DEST_PATH} ${LOCAL_ENGINE_DEST_PATH} - ENGINE_DEST_PATH="${LOCAL_ENGINE_DEST_PATH}" triton import \ + # NOTE: Could use ENGINE_DEST_PATH set to NFS mount for pre-built engines in future + triton import \ --model ${MODEL} \ --backend tensorrtllm \ --model-repository "${MODEL_REPO}" @@ -81,7 +58,6 @@ function post_test() { echo "post_test" } - ### Test ### set +e From 42676da9f611c1d2dc22848d21a41c3bafd9b936 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 16:14:37 -0700 Subject: [PATCH 46/80] Propagate test failure to job failure and log collection --- qa/L0_openai/test.sh | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 96e30cda87..8d73397f75 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -48,20 +48,28 @@ function pre_test() { function run_test() { pushd openai/openai/tests - pytest -s -v --junitxml=test_openai.xml 2>&1 | tee test_openai.log + TEST_LOG="test_openai.log" + + # Capture error code without exiting to allow log collection + set +e + pytest -s -v --junitxml=test_openai.xml 2>&1 > ${TEST_LOG} + if [ $? 
-ne 0 ]; then + cat ${TEST_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e + + # Collect logs for error analysis when needed cp *.xml *.log ../../../ popd } -function post_test() { - # Placeholder - echo "post_test" -} - ### Test ### -set +e +RET=0 + pre_test run_test -post_test -set -e + +exit ${RET} From cacaf0bfc25e33874b8ecb4a7f4a40dac7b46d58 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 16:15:29 -0700 Subject: [PATCH 47/80] Add xml files to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d553dfde16..e5b4814197 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,6 @@ __pycache__ tmp *.log +*.xml test_results.txt artifacts From b6c3f9ec108140799a27590fc074339c18697632 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 16:38:47 -0700 Subject: [PATCH 48/80] Test /v1/models with multiple models and remove TODOs --- python/openai/README.md | 5 +- .../openai/openai/tests/test_completions.py | 3 +- .../tests/test_models/mock_llm_2/1/model.py | 108 ++++++++++++++++++ .../tests/test_models/mock_llm_2/config.pbtxt | 60 ++++++++++ .../openai/openai/tests/test_observability.py | 15 ++- 5 files changed, 178 insertions(+), 13 deletions(-) create mode 100644 python/openai/openai/tests/test_models/mock_llm_2/1/model.py create mode 100644 python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt diff --git a/python/openai/README.md b/python/openai/README.md index 13a5e7a73b..c48c151637 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -107,10 +107,9 @@ pytest -v **NOTE**: The workflow for preparing TRT-LLM engines, model repository, etc. in order to load and test is not fleshed out in the README here yet. You can try using the Triton CLI or follow existing TRT-LLM backend examples to prepare a model repository, and point -at the model repository accordingly when following the examples. This should be fleshed out - or cleaned up in the future. +at the model repository accordingly when following the examples. -0. `[TODO]` Prepare your model repository for a TensorRT-LLM model, build the engine, etc. +0. Prepare your model repository for a TensorRT-LLM model, build the engine, etc. 1. Build and launch the container: - Mounts the openai source files to `/workspace` for simplicity, later on these will be shipped in the container. diff --git a/python/openai/openai/tests/test_completions.py b/python/openai/openai/tests/test_completions.py index e43e225988..b5b7a6f2f5 100644 --- a/python/openai/openai/tests/test_completions.py +++ b/python/openai/openai/tests/test_completions.py @@ -48,7 +48,7 @@ def test_completions_sampling_parameters( ) print("Response:", response.json()) - # TODO: Add support and remove this check + # FIXME: Add support and remove this check unsupported_parameters = ["logprobs", "logit_bias"] if sampling_parameter in unsupported_parameters: assert response.status_code == 400 @@ -315,7 +315,6 @@ def test_lora(self): def test_multi_lora(self): pass - # TODO: Do we want to support "usage" field for token counts in response? @pytest.mark.skip(reason="Not Implemented Yet") def test_usage_response(self): pass diff --git a/python/openai/openai/tests/test_models/mock_llm_2/1/model.py b/python/openai/openai/tests/test_models/mock_llm_2/1/model.py new file mode 100644 index 0000000000..1cf5f3613c --- /dev/null +++ b/python/openai/openai/tests/test_models/mock_llm_2/1/model.py @@ -0,0 +1,108 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import time + +import numpy as np +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def initialize(self, args): + self.model_config = json.loads(args["model_config"]) + self.decoupled = self.model_config.get("model_transaction_policy", {}).get( + "decoupled" + ) + + def execute(self, requests): + if self.decoupled: + return self.exec_decoupled(requests) + else: + return self.exec(requests) + + def exec(self, requests): + responses = [] + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + + input_np = pb_utils.get_input_tensor_by_name( + request, "text_intpu" + ).as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() + stream = stream_np.flatten()[0] + if stream: + responses.append( + pb_utils.InferenceResponse( + error=pb_utils.TritonError( + "STREAM only supported in decoupled mode" + ) + ) + ) + else: + out_tensor = pb_utils.Tensor( + "text_output", np.repeat(input_np, rep_count, axis=1) + ) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses + + def exec_decoupled(self, requests): + for request in requests: + params = json.loads(request.parameters()) + rep_count = params["REPETITION"] if "REPETITION" in params else 1 + fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False + delay = params["DELAY"] if "DELAY" in params else None + + sender = request.get_response_sender() + input_np = pb_utils.get_input_tensor_by_name( + request, "text_input" + ).as_numpy() + stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() + out_tensor = pb_utils.Tensor("text_output", input_np) + response = pb_utils.InferenceResponse([out_tensor]) + # If stream enabled, just send multiple copies of response + # FIXME: Could split up response string into tokens, but this is simpler for now. 
+ stream = stream_np.flatten()[0] + if stream: + for _ in range(rep_count): + if delay is not None: + time.sleep(delay) + sender.send(response) + sender.send( + None + if not fail_last + else pb_utils.InferenceResponse( + error=pb_utils.TritonError("An Error Occurred") + ), + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, + ) + # If stream disabled, just send one response + else: + sender.send( + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) + return None diff --git a/python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt b/python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt new file mode 100644 index 0000000000..5f665ff543 --- /dev/null +++ b/python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt @@ -0,0 +1,60 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+backend: "python" + +max_batch_size: 0 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ 1, 1 ] + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1, 1 ] + } +] + +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ 1, -1 ] + } +] + +instance_group [ + { + count: 1 + kind: KIND_MODEL + } +] diff --git a/python/openai/openai/tests/test_observability.py b/python/openai/openai/tests/test_observability.py index eca88a03de..7a4a4d7e81 100644 --- a/python/openai/openai/tests/test_observability.py +++ b/python/openai/openai/tests/test_observability.py @@ -49,19 +49,18 @@ def test_startup_metrics(self, client): assert "nv_cpu_utilization" in response.json() ### Models ### - def test_models_list(self, client, model): - # TODO: Load multiple models and make sure exactly ALL are returned + def test_models_list(self, client): response = client.get("/v1/models") assert response.status_code == 200 models = response.json()["data"] - assert len(models) == 1 - assert models[0]["id"] == model - assert models[0]["object"] == "model" - assert models[0]["created"] > 0 - assert models[0]["owned_by"] == "Triton Inference Server" + assert len(models) == 2 + for model in models: + assert model["id"] + assert model["object"] == "model" + assert model["created"] > 0 + assert model["owned_by"] == "Triton Inference Server" def test_models_get(self, client, model): - # TODO: Load multiple models and make sure exactly 1 is returned response = client.get(f"/v1/models/{model}") assert response.status_code == 200 model_resp = response.json() From 5cc80fe4612de3cda6cd8e83c139d6857a714f72 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 16:39:46 -0700 Subject: [PATCH 49/80] Add openai folder copy to gitignore in testing --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e5b4814197..01c71fd9b8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ tmp *.xml test_results.txt artifacts +qa/L0_openai/openai From 9f70a1dabead0af6de92074680b5f1d3c62614ec Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 16:55:34 -0700 Subject: [PATCH 50/80] Add streaming completion test, remove trtllm models from git repo --- .gitignore | 3 + .../tests/tensorrtllm_models/ensemble/1/.tmp | 0 .../tensorrtllm_models/ensemble/config.pbtxt | 470 ----------- .../postprocessing/1/model.py | 246 ------ .../postprocessing/config.pbtxt | 113 --- .../preprocessing/1/model.py | 418 --------- .../preprocessing/config.pbtxt | 156 ---- .../tensorrt_llm/1/.gitkeep | 0 .../tensorrt_llm/1/model.py | 797 ------------------ .../tensorrt_llm/config.pbtxt | 542 ------------ .../tensorrt_llm_bls/1/lib/decode.py | 347 -------- .../tensorrt_llm_bls/1/lib/triton_decoder.py | 478 ----------- .../tensorrt_llm_bls/1/model.py | 137 --- .../tensorrt_llm_bls/config.pbtxt | 252 ------ .../openai/tests/test_chat_completions.py | 3 +- .../openai/openai/tests/test_openai_client.py | 44 +- 16 files changed, 42 insertions(+), 3964 deletions(-) delete mode 100644 python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp delete mode 100644 python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt delete mode 100644 python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py delete mode 100644 python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt delete mode 100644 python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py delete mode 100644 
python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py delete mode 100644 python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt diff --git a/.gitignore b/.gitignore index 01c71fd9b8..02be5ecddc 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ tmp *.xml test_results.txt artifacts + +# Test exclusions qa/L0_openai/openai +qa/L0_openai/tensorrtllm_models diff --git a/python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp b/python/openai/openai/tests/tensorrtllm_models/ensemble/1/.tmp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt deleted file mode 100644 index b82990446d..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/ensemble/config.pbtxt +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "ensemble" -platform: "ensemble" -max_batch_size: 64 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "DECODER_QUERY" - value: "decoder_text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - input_map { - key: "EMBEDDING_BIAS_WORDS" - value: "embedding_bias_words" - } - input_map { - key: "EMBEDDING_BIAS_WEIGHTS" - value: "embedding_bias_weights" - } - input_map { - key: "END_ID" - value: "end_id" - } - input_map { - key: "PAD_ID" - value: "pad_id" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_DECODER_INPUT_LEN" - value: "_REQUEST_DECODER_INPUT_LEN" - } - 
output_map { - key: "DECODER_INPUT_ID" - value: "_DECODER_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - output_map { - key: "STOP_WORDS_IDS" - value: "_STOP_WORDS_IDS" - } - output_map { - key: "BAD_WORDS_IDS" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "EMBEDDING_BIAS" - value: "_EMBEDDING_BIAS" - } - output_map { - key: "OUT_END_ID" - value: "_PREPROCESSOR_END_ID" - } - output_map { - key: "OUT_PAD_ID" - value: "_PREPROCESSOR_PAD_ID" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "decoder_input_ids" - value: "_DECODER_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "decoder_input_lengths" - value: "_REQUEST_DECODER_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "_PREPROCESSOR_END_ID" - } - input_map { - key: "pad_id" - value: "_PREPROCESSOR_PAD_ID" - } - input_map { - key: "embedding_bias" - value: "_EMBEDDING_BIAS" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_length" - value: "min_length" - } - input_map { - key: "presence_penalty" - value: "presence_penalty" - } - input_map { - key: "frequency_penalty" - value: "frequency_penalty" - } - input_map { - key: "random_seed" - value: "random_seed" - } - input_map { - key: "return_log_probs" - value: "return_log_probs" - } - input_map { - key: "return_context_logits" - value: "return_context_logits" - } - input_map { - key: "return_generation_logits" - value: "return_generation_logits" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - input_map { - key: "prompt_embedding_table" - value: "prompt_embedding_table" - } - input_map { - key: "prompt_vocab_size" - value: "prompt_vocab_size" - } - input_map { - key: "stop_words_list" - value: "_STOP_WORDS_IDS" - } - input_map { - key: "bad_words_list" - value: "_BAD_WORDS_IDS" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - output_map { - key: "sequence_length" - value: "_SEQUENCE_LENGTH" - }, - output_map { - key: "cum_log_probs" - value: "_CUM_LOG_PROBS" - } - output_map { - key: "output_log_probs" - value: "_OUTPUT_LOG_PROBS" - }, - output_map { - key: "context_logits" - value: "_CONTEXT_LOGITS" - }, - output_map { - key: "generation_logits" - value: "_GENERATION_LOGITS" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - input_map { - key: "CUM_LOG_PROBS" - value: "_CUM_LOG_PROBS" - } - input_map { - key: "OUTPUT_LOG_PROBS" - value: "_OUTPUT_LOG_PROBS" - } - input_map { - key: "CONTEXT_LOGITS" - value: "_CONTEXT_LOGITS" - } - input_map { - key: "GENERATION_LOGITS" - value: "_GENERATION_LOGITS" - } - input_map { - key: "SEQUENCE_LENGTH" - value: "_SEQUENCE_LENGTH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - output_map { - key: "OUT_OUTPUT_LOG_PROBS" - value: "output_log_probs" - } - output_map { - key: "OUT_CUM_LOG_PROBS" - value: "cum_log_probs" - } - output_map { - key: "OUT_CONTEXT_LOGITS" - value: "context_logits" - } - 
output_map { - key: "OUT_GENERATION_LOGITS" - value: "generation_logits" - } - } - ] -} diff --git a/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py b/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py deleted file mode 100644 index 0812e19b3e..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/postprocessing/1/model.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] - - skip_special_tokens = model_config["parameters"].get("skip_special_tokens") - if skip_special_tokens is not None: - skip_special_tokens_str = skip_special_tokens["string_value"].lower() - if skip_special_tokens_str in [ - "true", - "false", - "1", - "0", - "t", - "f", - "y", - "n", - "yes", - "no", - ]: - self.skip_special_tokens = skip_special_tokens_str in [ - "true", - "1", - "t", - "y", - "yes", - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." - ) - self.skip_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." - ) - self.skip_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True - ) - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") - - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. 
- for idx, request in enumerate(requests): - # Get input tensors - tokens_batch = pb_utils.get_input_tensor_by_name( - request, "TOKENS_BATCH" - ).as_numpy() - - # Get sequence length - sequence_lengths = pb_utils.get_input_tensor_by_name( - request, "SEQUENCE_LENGTH" - ).as_numpy() - - # Get cum log probs - cum_log_probs = pb_utils.get_input_tensor_by_name(request, "CUM_LOG_PROBS") - - # Get sequence length - output_log_probs = pb_utils.get_input_tensor_by_name( - request, "OUTPUT_LOG_PROBS" - ) - - # Get context logits - context_logits = pb_utils.get_input_tensor_by_name( - request, "CONTEXT_LOGITS" - ) - - # Get generation logits - generation_logits = pb_utils.get_input_tensor_by_name( - request, "GENERATION_LOGITS" - ) - - # Reshape Input - # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) - # tokens_batch = tokens_batch.T - - # Postprocessing output data. - outputs = self._postprocessing(tokens_batch, sequence_lengths) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( - "OUTPUT", np.array(outputs).astype(self.output_dtype) - ) - - outputs = [] - outputs.append(output_tensor) - - if cum_log_probs: - out_cum_log_probs = pb_utils.Tensor( - "OUT_CUM_LOG_PROBS", cum_log_probs.as_numpy() - ) - outputs.append(out_cum_log_probs) - else: - out_cum_log_probs = pb_utils.Tensor( - "OUT_CUM_LOG_PROBS", np.array([[0.0]], dtype=np.float32) - ) - outputs.append(out_cum_log_probs) - - if output_log_probs: - out_output_log_probs = pb_utils.Tensor( - "OUT_OUTPUT_LOG_PROBS", output_log_probs.as_numpy() - ) - outputs.append(out_output_log_probs) - else: - out_output_log_probs = pb_utils.Tensor( - "OUT_OUTPUT_LOG_PROBS", np.array([[[0.0]]], dtype=np.float32) - ) - outputs.append(out_output_log_probs) - - if context_logits: - out_context_logits = pb_utils.Tensor( - "OUT_CONTEXT_LOGITS", context_logits.as_numpy() - ) - outputs.append(out_context_logits) - else: - out_context_logits = pb_utils.Tensor( - "OUT_CONTEXT_LOGITS", np.array([[[0.0]]], dtype=np.float32) - ) - outputs.append(out_context_logits) - - if generation_logits: - out_generation_logits = pb_utils.Tensor( - "OUT_GENERATION_LOGITS", generation_logits.as_numpy() - ) - outputs.append(out_generation_logits) - else: - out_generation_logits = pb_utils.Tensor( - "OUT_GENERATION_LOGITS", np.array([[[[0.0]]]], dtype=np.float32) - ) - outputs.append(out_generation_logits) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse(output_tensors=outputs) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. 
- """ - print("Cleaning up...") - - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], skip_special_tokens=self.skip_special_tokens - ) - outputs.append(output.encode("utf8")) - return outputs diff --git a/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt deleted file mode 100644 index dee851662d..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/postprocessing/config.pbtxt +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "postprocessing" -backend: "python" -max_batch_size: 256 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "SEQUENCE_LENGTH" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "CUM_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "OUTPUT_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - }, - { - name: "CONTEXT_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - }, - { - name: "GENERATION_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - optional: true - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "OUT_CUM_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "OUT_OUTPUT_LOG_PROBS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "OUT_CONTEXT_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "OUT_GENERATION_LOGITS" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" - } -} - -parameters { - key: "skip_special_tokens" - value: { - string_value: "${skip_special_tokens}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py b/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py deleted file mode 100644 index eb4487c803..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/preprocessing/1/model.py +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -from typing import List - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. 
- """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"] - - add_special_tokens = model_config["parameters"].get("add_special_tokens") - if add_special_tokens is not None: - add_special_tokens_str = add_special_tokens["string_value"].lower() - if add_special_tokens_str in [ - "true", - "false", - "1", - "0", - "t", - "f", - "y", - "n", - "yes", - "no", - ]: - self.add_special_tokens = add_special_tokens_str in [ - "true", - "1", - "t", - "y", - "yes", - ] - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." - ) - self.add_special_tokens = True - else: - print( - f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." - ) - self.add_special_tokens = True - - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True - ) - if isinstance(self.tokenizer, T5Tokenizer): - self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.tokenizer_end_id = self.tokenizer.encode( - self.tokenizer.eos_token, add_special_tokens=False - )[0] - self.tokenizer_pad_id = self.tokenizer.encode( - self.tokenizer.pad_token, add_special_tokens=False - )[0] - - # Parse model output configs and convert Triton types to numpy types - output_names = [ - "INPUT_ID", - "DECODER_INPUT_ID", - "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "OUT_END_ID", - "OUT_PAD_ID", - ] - input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_input_config_by_name(model_config, input_name)[ - "data_type" - ] - ), - ) - - for output_name in output_names: - setattr( - self, - output_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name(model_config, output_name)[ - "data_type" - ] - ), - ) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. 
- Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - logger = pb_utils.Logger - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() - decoder_query = pb_utils.get_input_tensor_by_name(request, "DECODER_QUERY") - if decoder_query is not None: - decoder_query = decoder_query.as_numpy() - - batch_dim = query.shape[0] - if batch_dim != 1: - err_str = ( - "Inflight batching backend expects requests with batch size of 1." - ) - logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError(err_str) - ) - ) - continue - - request_output_len = pb_utils.get_input_tensor_by_name( - request, "REQUEST_OUTPUT_LEN" - ).as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, "BAD_WORDS_DICT" - ) - if bad_words_dict is not None: - bad_words_dict = bad_words_dict.as_numpy() - - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, "STOP_WORDS_DICT" - ) - if stop_words_dict is not None: - stop_words_dict = stop_words_dict.as_numpy() - - embedding_bias_words = pb_utils.get_input_tensor_by_name( - request, "EMBEDDING_BIAS_WORDS" - ) - if embedding_bias_words is not None: - embedding_bias_words = embedding_bias_words.as_numpy() - - embedding_bias_weights = pb_utils.get_input_tensor_by_name( - request, "EMBEDDING_BIAS_WEIGHTS" - ) - if embedding_bias_weights is not None: - embedding_bias_weights = embedding_bias_weights.as_numpy() - - # Take the end_id from the input tensors - # If not specified, use tokenizer to get end_id - end_id = pb_utils.get_input_tensor_by_name(request, "END_ID") - if end_id is not None: - end_id = end_id.as_numpy() - else: - end_id = [[self.tokenizer_end_id]] - - # Take the pad_id from the input tensors - # If not specified, use tokenizer to get pad_id - pad_id = pb_utils.get_input_tensor_by_name(request, "PAD_ID") - if pad_id is not None: - pad_id = pad_id.as_numpy() - else: - pad_id = [[self.tokenizer_pad_id]] - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - if decoder_query is not None: - decoder_input_id, request_decoder_input_len = self._create_request( - decoder_query - ) - else: - decoder_input_id = pad_id * np.ones((1, 1), np.int32) - request_decoder_input_len = 1 * np.ones((1, 1), np.int32) - - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - embedding_bias = self._get_embedding_bias( - embedding_bias_words, - embedding_bias_weights, - self.embedding_bias_weights_dtype, - ) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
- input_id_tensor = pb_utils.Tensor( - "INPUT_ID", input_id.astype(self.input_id_dtype) - ) - request_input_len_tensor = pb_utils.Tensor( - "REQUEST_INPUT_LEN", - request_input_len.astype(self.request_input_len_dtype), - ) - decoder_input_id_tensor = pb_utils.Tensor( - "DECODER_INPUT_ID", decoder_input_id.astype(self.decoder_input_id_dtype) - ) - request_decoder_input_len_tensor = pb_utils.Tensor( - "REQUEST_DECODER_INPUT_LEN", - request_decoder_input_len.astype(self.request_decoder_input_len_dtype), - ) - request_output_len_tensor = pb_utils.Tensor( - "REQUEST_OUTPUT_LEN", request_output_len - ) - bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) - stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) - embedding_bias_tensor = pb_utils.Tensor("EMBEDDING_BIAS", embedding_bias) - end_id_tensor = pb_utils.Tensor( - "OUT_END_ID", np.array(end_id, dtype=np.int32) - ) - pad_id_tensor = pb_utils.Tensor( - "OUT_PAD_ID", np.array(pad_id, dtype=np.int32) - ) - - inference_response = pb_utils.InferenceResponse( - output_tensors=[ - input_id_tensor, - decoder_input_id_tensor, - bad_words_ids_tensor, - stop_words_ids_tensor, - request_input_len_tensor, - request_decoder_input_len_tensor, - request_output_len_tensor, - embedding_bias_tensor, - end_id_tensor, - pad_id_tensor, - ] - ) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - if isinstance(self.tokenizer, T5Tokenizer): - start_ids = [ - np.array( - [self.tokenizer_bos_id] - + self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens - ) - ).astype(int) - for s in query - ] - else: - start_ids = [ - np.array( - self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens - ) - ).astype(int) - for s in query - ] - start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) - - max_len = 0 - for seq in start_ids: - max_len = max(max_len, seq.shape[0]) - start_ids = np.stack( - [ - np.pad( - seq, - (0, max_len - seq.shape[0]), - "constant", - constant_values=(0, self.tokenizer_pad_id), - ) - for seq in start_ids - ] - ) - - return start_ids, start_lengths - - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): - """ - word_lists format: - len(word_lists) == batch_size - word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum". 
- """ - assert self.tokenizer != None, "need to set tokenizer" - - if word_lists is None: - # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") - - flat_ids = [] - offsets = [] - for word_list in word_lists: - item_flat_ids = [] - item_offsets = [] - - for word in word_list: - if isinstance(word, bytes): - word = word.decode() - - ids = self.tokenizer.encode(word, add_special_tokens=False) - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) - - def _get_embedding_bias( - self, embedding_bias_words, embedding_bias_weights, bias_dtype - ): - assert self.tokenizer != None, "need to set tokenizer" - - if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) - - batch_embedding_bias = [] - for words, weights in zip(embedding_bias_words, embedding_bias_weights): - vocab_size = self.tokenizer.vocab_size - embedding_bias = [0.0] * vocab_size - - assert len(words) == len( - weights - ), "Embedding bias words must have same dimension as embedding bias weights" - - for word, weight in zip(words, weights): - if isinstance(word, bytes): - word = word.decode() - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - for id in ids: - embedding_bias[id] += weight - - batch_embedding_bias.append(np.array(embedding_bias)) - - return np.array(batch_embedding_bias, dtype=bias_dtype) diff --git a/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt deleted file mode 100644 index a262cf6983..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/preprocessing/config.pbtxt +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "preprocessing" -backend: "python" -max_batch_size: 256 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "DECODER_QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "EMBEDDING_BIAS_WORDS" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "EMBEDDING_BIAS_WEIGHTS" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "END_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - }, - { - name: "PAD_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "DECODER_INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_DECODER_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "EMBEDDING_BIAS" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "OUT_END_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "OUT_PAD_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct/hf_download" - } -} - -parameters { - key: "add_special_tokens" - value: { - string_value: "${add_special_tokens}" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py deleted file mode 100644 index 3425a20f57..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/1/model.py +++ /dev/null @@ -1,797 +0,0 @@ -import datetime -import json -import os -import time -from threading import Lock, Thread - -import numpy as np -import tensorrt_llm.bindings.executor as trtllm -import triton_python_backend_utils as pb_utils -from torch import from_numpy - - -def get_input_tensor_by_name(request, name): - tensor = pb_utils.get_input_tensor_by_name(request, name) - if tensor is None: - if name == "temperature": - print(f"Tensor for {name} is None!") - return None - return tensor.as_numpy() - - -def get_input_scalar_by_name(request, name): - tensor = get_input_tensor_by_name(request, name) - if tensor is None: - if name == 
"temperature": - print(f"Scalar for {name} is None!") - return None - if tensor.size != 1: - raise pb_utils.TritonModelException(f"Expected a single value for {name}") - return tensor.item() - - -def read_parameter_as_type(value, name, pytype=str): - if value == "": - return None - if value.startswith("${") and value.endswith("}"): - return None - if pytype is bool: - return value.lower() in ["1", "true"] - try: - result = pytype(value) - return result - except: - pb_utils.Logger.log_warning( - f"Could not read parameter '{name}' with value '{value}', will use default." - ) - return None - - -def get_parameter(model_config, name, pytype=str): - if name not in model_config["parameters"]: - return None - return read_parameter_as_type( - model_config["parameters"][name]["string_value"], name, pytype - ) - - -def convert_word_list(word_list): - if word_list is None: - return None - word_list = word_list.tolist() - if len(word_list) == 0 or len(word_list[0]) != 2: - raise pb_utils.TritonModelException(f"Invalid format for word list.") - words, indices = word_list[0] - result = [] - current_index = 0 - for i in indices: - if i == -1: - continue - if i > len(words): - raise pb_utils.TritonModelException(f"Invalid format for word list.") - current_word = [] - while current_index < i: - current_word.append(words[current_index]) - current_index += 1 - result.append(current_word) - return result - - -def parse_medusa_choices(medusa_choices): - if medusa_choices is None: - return None - try: - result = json.loads( - "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]" - ) - assert isinstance(result, list) and len(result) > 0 - assert all([isinstance(x, list) for x in result]) - assert all([isinstance(y, int) for x in result for y in x]) - except Exception: - raise pb_utils.TritonModelException("Invalid format for medusa_choices") - return result - - -def get_sampling_config_from_request(request): - kwargs = {} - kwargs["beam_width"] = get_input_scalar_by_name(request, "beam_width") or 1 - kwargs["top_k"] = get_input_scalar_by_name(request, "runtime_top_k") - kwargs["top_p"] = get_input_scalar_by_name(request, "runtime_top_p") - kwargs["top_p"] = ( - None if kwargs["top_p"] is None or kwargs["top_p"] <= 0 else kwargs["top_p"] - ) - kwargs["random_seed"] = get_input_scalar_by_name(request, "random_seed") - kwargs["temperature"] = get_input_scalar_by_name(request, "temperature") - # print(f"=========== [DEBUG] [trtllm python runtime model.py] {kwargs['temperature']=} ==========") - kwargs["min_length"] = get_input_scalar_by_name(request, "min_length") - kwargs["repetition_penalty"] = get_input_scalar_by_name( - request, "repetition_penalty" - ) - kwargs["presence_penalty"] = get_input_scalar_by_name(request, "presence_penalty") - kwargs["frequency_penalty"] = get_input_scalar_by_name(request, "frequency_penalty") - kwargs["length_penalty"] = get_input_scalar_by_name(request, "len_penalty") - kwargs["top_p_min"] = get_input_scalar_by_name(request, "runtime_top_p_min") - kwargs["top_p_reset_ids"] = get_input_scalar_by_name( - request, "runtime_top_p_reset_ids" - ) - kwargs["top_p_decay"] = get_input_scalar_by_name(request, "runtime_top_p_decay") - kwargs["beam_search_diversity_rate"] = get_input_scalar_by_name( - request, "beam_search_diversity_rate" - ) - kwargs["early_stopping"] = get_input_scalar_by_name(request, "early_stopping") - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.SamplingConfig(**kwargs) - - -def get_output_config_from_request(request, 
exclude_input_from_output): - kwargs = {} - kwargs["return_log_probs"] = get_input_scalar_by_name(request, "return_log_probs") - kwargs["return_context_logits"] = get_input_scalar_by_name( - request, "return_context_logits" - ) - kwargs["return_generation_logits"] = get_input_scalar_by_name( - request, "return_generation_logits" - ) - kwargs["exclude_input_from_output"] = exclude_input_from_output - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.OutputConfig(**kwargs) - - -def get_external_draft_tokens_config_from_request(request): - kwargs = {} - draft_input_ids = get_input_tensor_by_name(request, "draft_input_ids") - if draft_input_ids is not None: - kwargs["tokens"] = draft_input_ids.tolist() - draft_logits = get_input_tensor_by_name(request, "draft_logits") - if draft_logits is not None: - kwargs["logits"] = from_numpy(draft_logits) - kwargs["acceptance_threshold"] = get_input_scalar_by_name( - request, "draft_acceptance_threshold" - ) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.ExternalDraftTokensConfig(**kwargs) - return None - - -def get_prompt_tuning_config_from_request(request): - # prompt_vocab_size is unused by executor. - kwargs = {} - prompt_embedding_table = get_input_tensor_by_name(request, "prompt_embedding_table") - if prompt_embedding_table is not None: - kwargs["embedding_table"] = from_numpy(prompt_embedding_table) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.PromptTuningConfig(**kwargs) - return None - - -def get_lora_config_from_request(request): - kwargs = {} - kwargs["task_id"] = get_input_scalar_by_name(request, "lora_task_id") - lora_weights = get_input_tensor_by_name(request, "lora_weights") - if lora_weights is not None: - kwargs["weights"] = from_numpy(lora_weights) - lora_config = get_input_tensor_by_name(request, "lora_config") - if lora_config is not None: - kwargs["config"] = from_numpy(lora_config) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - return trtllm.LoraConfig(**kwargs) - return None - - -def convert_request(request, exclude_input_from_output, decoupled): - inputs = {} - input_token_ids = get_input_tensor_by_name(request, "input_ids") - if input_token_ids is None: - raise pb_utils.TritonModelException("A value is required for input_ids") - input_token_ids = input_token_ids.tolist() - if len(input_token_ids) == 0: - raise pb_utils.TritonModelException(f"Invalid format for input_ids") - inputs["input_token_ids"] = input_token_ids[0] - # input_lengths is not not used by executor. - inputs["max_new_tokens"] = get_input_scalar_by_name(request, "request_output_len") - if inputs["max_new_tokens"] is None: - raise pb_utils.TritonModelException( - "A value is required for request_output_len" - ) - inputs["streaming"] = get_input_scalar_by_name(request, "streaming") - if inputs["streaming"] and not decoupled: - raise pb_utils.TritonModelException( - "Streaming is only supported in decoupled mode." 
- ) - inputs["end_id"] = get_input_scalar_by_name(request, "end_id") - inputs["pad_id"] = get_input_scalar_by_name(request, "pad_id") - inputs["stop_words"] = convert_word_list( - get_input_tensor_by_name(request, "stop_words_list") - ) - inputs["bad_words"] = convert_word_list( - get_input_tensor_by_name(request, "bad_words_list") - ) - embedding_bias = get_input_tensor_by_name(request, "embedding_bias") - if embedding_bias is not None and embedding_bias.size != 0: - inputs["embedding_bias"] = from_numpy(embedding_bias).squeeze() - - sampling_config = get_sampling_config_from_request(request) - output_config = get_output_config_from_request(request, exclude_input_from_output) - external_draft_tokens_config = get_external_draft_tokens_config_from_request( - request - ) - prompt_tuning_config = get_prompt_tuning_config_from_request(request) - lora_config = get_lora_config_from_request(request) - - return trtllm.Request( - **inputs, - sampling_config=sampling_config, - output_config=output_config, - external_draft_tokens_config=external_draft_tokens_config, - prompt_tuning_config=prompt_tuning_config, - lora_config=lora_config, - ) - - -def convert_response(response): - if response.has_error(): - return ( - pb_utils.InferenceResponse( - output_tensors=[], error=pb_utils.TritonError(response.error_msg) - ), - True, - ) - result = response.result - beam_lengths = np.expand_dims( - np.array([len(beam) for beam in result.output_token_ids], np.int32), 0 - ) - max_beam_length = max([len(beam) for beam in result.output_token_ids]) - output_ids = np.full( - (1, len(result.output_token_ids), max_beam_length), -1, np.int32 - ) - for idx, beam in enumerate(result.output_token_ids): - output_ids[0, idx, : len(beam)] = beam - output_tensors = [ - pb_utils.Tensor("output_ids", output_ids), - pb_utils.Tensor("sequence_length", beam_lengths), - ] - output_tensors.append( - pb_utils.Tensor( - "cum_log_probs", - np.expand_dims(np.array(result.cum_log_probs, np.float32), 0) - if result.cum_log_probs is not None - else np.zeros((1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "output_log_probs", - np.expand_dims(np.array(result.log_probs, np.float32), 0) - if result.log_probs is not None - else np.zeros((1, 1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "context_logits", - np.expand_dims(np.array(result.context_logits, np.float32), 0) - if result.context_logits is not None - else np.zeros((1, 1, 1), np.float32), - ) - ) - output_tensors.append( - pb_utils.Tensor( - "generation_logits", - np.expand_dims(np.array(result.generation_logits, np.float32), 0) - if result.generation_logits is not None - else np.zeros((1, 1, 1, 1), np.float32), - ) - ) - return pb_utils.InferenceResponse(output_tensors), result.is_final - - -def convert_scheduler_policy(batch_scheduler_policy: str): - if batch_scheduler_policy.lower() == "max_utilization": - return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION - elif batch_scheduler_policy.lower() == "guaranteed_no_evict": - return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT - raise pb_utils.TritonModelException( - f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported." 
- ) - - -def convert_batching_type(gpt_model_type: str): - if gpt_model_type is None: - return None - if ( - gpt_model_type.lower() == "inflight_fused_batching" - or gpt_model_type.lower() == "inflight_batching" - ): - return trtllm.BatchingType.INFLIGHT - elif gpt_model_type.lower() == "v1": - return trtllm.BatchingType.STATIC - raise pb_utils.TritonModelException( - f"gpt_model_type value of '{gpt_model_type}' is not supported." - ) - - -def convert_decoding_mode(decoding_mode: str): - if decoding_mode is None: - return None - elif decoding_mode == "auto": - return trtllm.DecodingMode.Auto() - elif decoding_mode == "top_k": - return trtllm.DecodingMode.TopK() - elif decoding_mode == "top_p": - return trtllm.DecodingMode.TopP() - elif decoding_mode == "top_k_top_p": - return trtllm.DecodingMode.TopKTopP() - elif decoding_mode == "beam_search": - return trtllm.DecodingMode.BeamSearch() - elif decoding_mode == "medusa": - return trtllm.DecodingMode.Medusa() - raise pb_utils.TritonModelException( - f"decoding_mode value of '{decoding_mode}' is not supported." - ) - - -def convert_timestamp_to_seconds(timestamp: str): - return int(datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp()) - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def get_scheduler_config(self, model_config): - batch_scheduler_policy = get_parameter(model_config, "batch_scheduler_policy") - if batch_scheduler_policy is None: - return trtllm.SchedulerConfig() - return trtllm.SchedulerConfig(convert_scheduler_policy(batch_scheduler_policy)) - - def get_kv_cache_config(self, model_config): - kwargs = { - "enable_block_reuse": get_parameter( - model_config, "enable_kv_cache_reuse", bool - ), - "max_tokens": get_parameter( - model_config, "max_tokens_in_paged_kv_cache", int - ), - "sink_token_length": get_parameter(model_config, "sink_token_length", int), - "max_attention_window": get_parameter( - model_config, "max_attention_window_size", int - ), - "free_gpu_memory_fraction": get_parameter( - model_config, "kv_cache_free_gpu_mem_fraction", float - ), - "host_cache_size": get_parameter( - model_config, "kv_cache_host_memory_bytes", int - ), - "onboard_blocks": get_parameter( - model_config, "kv_cache_onboard_blocks", bool - ), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.KvCacheConfig(**kwargs) - - def get_parallel_config(self, model_config): - kwargs = {} - gpu_device_ids = get_parameter(model_config, "gpu_device_ids") - if gpu_device_ids: - kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")] - self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR", "0") == "1" - if self.use_orchestrator_mode: - kwargs["communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR - worker_path = get_parameter(model_config, "worker_path") - if worker_path is not None: - raise pb_utils.TritonModelException( - "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable." 
- ) - executor_worker_path = get_parameter(model_config, "executor_worker_path") - kwargs["orchestrator_config"] = trtllm.OrchestratorConfig( - True, executor_worker_path - ) - if len(kwargs) > 0: - return trtllm.ParallelConfig(**kwargs) - return None - - def get_peft_cache_config(self, model_config): - kwargs = { - "optimal_adapter_size": get_parameter( - model_config, "lora_cache_optimal_adapter_size", int - ), - "max_adapter_size": get_parameter( - model_config, "lora_cache_max_adapter_size", int - ), - "device_cache_percent": get_parameter( - model_config, "lora_cache_gpu_memory_fraction", float - ), - "host_cache_size": get_parameter( - model_config, "lora_cache_host_memory_bytes", int - ), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.PeftCacheConfig(**kwargs) - - def get_decoding_config(self, model_config): - kwargs = { - "medusa_choices": parse_medusa_choices( - get_parameter(model_config, "medusa_choices") - ), - "decoding_mode": convert_decoding_mode( - get_parameter(model_config, "decoding_mode") - ), - } - print(kwargs) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.DecodingConfig(**kwargs) - - def get_executor_config(self, model_config): - kwargs = { - "max_beam_width": get_parameter(model_config, "max_beam_width", int), - "scheduler_config": self.get_scheduler_config(model_config), - "kv_cache_config": self.get_kv_cache_config(model_config), - "enable_chunked_context": get_parameter( - model_config, "enable_chunked_context", bool - ), - "normalize_log_probs": get_parameter( - model_config, "normalize_log_probs", bool - ), - "batching_type": convert_batching_type( - get_parameter(model_config, "gpt_model_type") - ), - "parallel_config": self.get_parallel_config(model_config), - "peft_cache_config": self.get_peft_cache_config(model_config), - "decoding_config": self.get_decoding_config(model_config), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - return trtllm.ExecutorConfig(**kwargs) - - def create_metrics(self, model: str, version: str, is_v1_model: bool): - self.request_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_request_metrics", - description="TRT LLM request metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.runtime_memory_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_runtime_memory_metrics", - description="TRT LLM runtime memory metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.kv_cache_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_kv_cache_block_metrics", - description="TRT LLM KV cache block metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - model_type = "v1" if is_v1_model else "inflight_batcher" - self.model_type_metric_family = pb_utils.MetricFamily( - name=f"nv_trt_llm_{model_type}_metrics", - description=f"TRT LLM {model_type}-specific metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - self.general_metric_family = pb_utils.MetricFamily( - name="nv_trt_llm_general_metrics", - description="General TRT LLM metrics", - kind=pb_utils.MetricFamily.GAUGE, - ) - common_labels = {"model": model, "version": version} - self.all_metrics = { - # Request metrics - "num_active_requests": self.request_metric_family.Metric( - labels={"request_type": "active", **common_labels} - ), - "max_num_active_requests": self.request_metric_family.Metric( - labels={"request_type": "max", **common_labels} - ), - "num_scheduled_requests": self.request_metric_family.Metric( - labels={"request_type": "scheduled", **common_labels} - ), - 
"num_context_requests": self.request_metric_family.Metric( - labels={"request_type": "context", **common_labels} - ), - # Runtime metrics - "cpu_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "cpu", **common_labels} - ), - "gpu_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "gpu", **common_labels} - ), - "pinned_mem_usage": self.runtime_memory_metric_family.Metric( - labels={"memory_type": "pinned", **common_labels} - ), - # KV cache metrics - "max_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "max", **common_labels} - ), - "free_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "free", **common_labels} - ), - "used_num_blocks": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "used", **common_labels} - ), - "tokens_per_block": self.kv_cache_metric_family.Metric( - labels={"kv_cache_block_type": "tokens_per", **common_labels} - ), - # General metrics - "timestamp": self.general_metric_family.Metric( - labels={"general_type": "timestamp", **common_labels} - ), - "iter": self.general_metric_family.Metric( - labels={"general_type": "iteration_counter", **common_labels} - ), - } - if is_v1_model: - self.all_metrics.update( - { - "num_ctx_tokens": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "total_context_tokens", - **common_labels, - } - ), - "num_gen_tokens": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "total_generation_tokens", - **common_labels, - } - ), - "empty_gen_slots": self.model_type_metric_family.Metric( - labels={ - "v1_specific_metric": "empty_generation_slots", - **common_labels, - } - ), - } - ) - else: - self.all_metrics.update( - { - "num_ctx_tokens": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "total_context_tokens", - **common_labels, - } - ), - "num_gen_requests": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "generation_requests", - **common_labels, - } - ), - "micro_batch_id": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "micro_batch_id", - **common_labels, - } - ), - "num_paused_requests": self.model_type_metric_family.Metric( - labels={ - "inflight_batcher_specific_metric": "paused_requests", - **common_labels, - } - ), - } - ) - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - - Parameters - ---------- - args : dict - Both keys and values are strings. The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - model_config = json.loads(args["model_config"]) - gpt_model_path = get_parameter(model_config, "gpt_model_path") - if get_parameter(model_config, "enable_trt_overlap", bool): - raise pb_utils.TritonModelException( - f"enable_trt_overlap=true is not supported." 
- ) - self.exclude_input_from_output = get_parameter( - model_config, "exclude_input_in_output", bool - ) - executor_config = self.get_executor_config(model_config) - self.executor = trtllm.Executor( - gpt_model_path, trtllm.ModelType.DECODER_ONLY, executor_config - ) - self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) - self.cancellation_check_period_ms = ( - get_parameter(model_config, "cancellation_check_period_ms", int) or 100 - ) - self.stats_check_period_ms = ( - get_parameter(model_config, "stats_check_period_ms", int) or 100 - ) - - if not self.decoupled: - raise pb_utils.TritonModelException( - "Please enable decoupled transaction policy in the model configuration to serve this model" - ) - - self.create_metrics( - args["model_name"], - args["model_version"], - is_v1_model=executor_config.batching_type == trtllm.BatchingType.STATIC, - ) - self.triton_id_to_req_id = {} - self.req_id_to_response_sender = {} - self.lock = Lock() - self.running = False - self.awaiter_thread = Thread(target=self.awaiter_loop) - self.cancellation_thread = Thread(target=self.cancellation_loop) - self.metrics_thread = Thread(target=self.metrics_loop) - if self.executor.can_enqueue_requests(): - self.running = True - self.awaiter_thread.start() - self.cancellation_thread.start() - self.metrics_thread.start() - else: - # In leader mode, worker ranks will wait here until leader is done. - self.executor.shutdown() - - def handle_stop_request(self, triton_id, response_sender): - if triton_id is None or triton_id == "": - response_sender.send( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - "A request id must be provided for request cancellation" - ) - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - return - - if triton_id in self.triton_id_to_req_id: - req_id = self.triton_id_to_req_id[triton_id] - self.executor.cancel_request(req_id) - - response_sender.send( - pb_utils.InferenceResponse(), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. - - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - if not self.executor.can_enqueue_requests(): - return - - # Convert to executor requests. 
- triton_requests = [] - executor_requests = [] - for request in requests: - response_sender = request.get_response_sender() - if get_input_scalar_by_name(request, "stop"): - self.handle_stop_request(request.request_id(), response_sender) - else: - try: - converted = convert_request( - request, self.exclude_input_from_output, self.decoupled - ) - except Exception as e: - response_sender.send( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'" - ) - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - else: - triton_requests.append(request) - executor_requests.append(converted) - - with self.lock: - request_ids = self.executor.enqueue_requests(executor_requests) - for req_id, request in zip(request_ids, triton_requests): - triton_id = request.request_id() - self.req_id_to_response_sender[req_id] = ( - triton_id, - request.get_response_sender(), - ) - self.triton_id_to_req_id[triton_id] = req_id - return None - - def awaiter_loop(self): - """Gets responses from executor and returns the results.""" - while self.running: - for response in self.executor.await_responses( - timeout=datetime.timedelta(milliseconds=1) - ): - req_id = response.request_id - with self.lock: - if req_id not in self.req_id_to_response_sender: - continue - triton_id, response_sender = self.req_id_to_response_sender[req_id] - - triton_response, is_final = convert_response(response) - response_sender.send( - triton_response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - if is_final - else 0, - ) - - if is_final: - with self.lock: - del self.triton_id_to_req_id[triton_id] - del self.req_id_to_response_sender[req_id] - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def cancellation_loop(self): - """Checks if any pending requests have been cancelled.""" - while self.running: - time.sleep(self.cancellation_check_period_ms / 1000.0) - with self.lock: - for req_id, ( - triton_id, - response_sender, - ) in self.req_id_to_response_sender.items(): - if response_sender.is_cancelled(): - self.executor.cancel_request(req_id) - # Remove local reference so response_sender can be cleaned properly. - del response_sender - - def metrics_loop(self): - """Updates triton metrics using stats from the executor.""" - while self.running: - time.sleep(self.stats_check_period_ms / 1000.0) - for stat in self.executor.get_latest_iteration_stats(): - try: - for key, metric in self.all_metrics.items(): - value = None - if hasattr(stat, key): - value = getattr(stat, key) - elif stat.kv_cache_stats is not None and hasattr( - stat.kv_cache_stats, key - ): - value = getattr(stat.kv_cache_stats, key) - elif stat.static_batching_stats is not None and hasattr( - stat.static_batching_stats, key - ): - value = getattr(stat.static_batching_stats, key) - elif stat.inflight_batching_stats is not None and hasattr( - stat.inflight_batching_stats, key - ): - value = getattr(stat.inflight_batching_stats, key) - if value is not None: - if key == "timestamp": - value = convert_timestamp_to_seconds(value) - metric.set(value) - else: - pb_utils.Logger.log_warn(f'Metric "{key}" not found.') - except Exception as e: - pb_utils.Logger.log_warn(f"Error while processing metrics: {e}") - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. 
This function allows - the model to perform any necessary clean ups before exit. - """ - if self.executor.can_enqueue_requests(): - self.running = False - self.awaiter_thread.join() - self.cancellation_thread.join() - self.metrics_thread.join() - self.executor.shutdown() diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt deleted file mode 100644 index 7c9f294b89..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "tensorrt_llm" -backend: "tensorrtllm" -#backend: "python" -max_batch_size: 256 - -model_transaction_policy { - decoupled: True -} - -dynamic_batching { - preferred_batch_size: [ 256 ] - max_queue_delay_microseconds: 1000 -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - allow_ragged_batch: true - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "draft_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "decoder_input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - reshape: { shape: [ ] } - }, - { - name: "draft_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "draft_acceptance_threshold" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "embedding_bias" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p_reset_ids" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "early_stopping" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: 
"return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - # the unique task ID for the given LoRA. - # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given. - # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. - # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached. - { - name: "lora_task_id" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ] - # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer - # each of the in / out tensors are first flattened and then concatenated together in the format above. - # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out. - { - name: "lora_weights" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - allow_ragged_batch: true - }, - # module identifier (same size a first dimension of lora_weights) - # See LoraModule::ModuleType for model id mapping - # - # "attn_qkv": 0 # compbined qkv adapter - # "attn_q": 1 # q adapter - # "attn_k": 2 # k adapter - # "attn_v": 3 # v adapter - # "attn_dense": 4 # adapter for the dense layer in attention - # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection - # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection - # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate - # - # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ] - { - name: "lora_config" - data_type: TYPE_INT32 - dims: [ -1, 3 ] - optional: true - allow_ragged_batch: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters: { - key: "max_beam_width" - value: { - string_value: "${max_beam_width}" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -parameters: { - key: "gpt_model_path" - value: { - string_value: "/tmp/engines/llama-3-8b-instruct" - } -} -parameters: { - key: "encoder_model_path" - value: { - string_value: "${encoder_engine_dir}" - } -} -parameters: { - key: "max_tokens_in_paged_kv_cache" - 
value: { - string_value: "${max_tokens_in_paged_kv_cache}" - } -} -parameters: { - key: "max_attention_window_size" - value: { - string_value: "${max_attention_window_size}" - } -} -parameters: { - key: "sink_token_length" - value: { - string_value: "${sink_token_length}" - } -} -parameters: { - key: "batch_scheduler_policy" - value: { - string_value: "${batch_scheduler_policy}" - } -} -parameters: { - key: "kv_cache_free_gpu_mem_fraction" - value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" - } -} -parameters: { - key: "kv_cache_host_memory_bytes" - value: { - string_value: "${kv_cache_host_memory_bytes}" - } -} -parameters: { - key: "kv_cache_onboard_blocks" - value: { - string_value: "${kv_cache_onboard_blocks}" - } -} -# enable_trt_overlap is deprecated and doesn't have any effect on the runtime -# parameters: { -# key: "enable_trt_overlap" -# value: { -# string_value: "${enable_trt_overlap}" -# } -# } -parameters: { - key: "exclude_input_in_output" - value: { - string_value: "True" - } -} -parameters: { - key: "cancellation_check_period_ms" - value: { - string_value: "${cancellation_check_period_ms}" - } -} -parameters: { - key: "stats_check_period_ms" - value: { - string_value: "${stats_check_period_ms}" - } -} -parameters: { - key: "iter_stats_max_iterations" - value: { - string_value: "${iter_stats_max_iterations}" - } -} -parameters: { - key: "request_stats_max_iterations" - value: { - string_value: "${request_stats_max_iterations}" - } -} -parameters: { - key: "enable_kv_cache_reuse" - value: { - string_value: "${enable_kv_cache_reuse}" - } -} -parameters: { - key: "normalize_log_probs" - value: { - string_value: "${normalize_log_probs}" - } -} -parameters: { - key: "enable_chunked_context" - value: { - string_value: "${enable_chunked_context}" - } -} -parameters: { - key: "gpu_device_ids" - value: { - string_value: "${gpu_device_ids}" - } -} -parameters: { - key: "lora_cache_optimal_adapter_size" - value: { - string_value: "${lora_cache_optimal_adapter_size}" - } -} -parameters: { - key: "lora_cache_max_adapter_size" - value: { - string_value: "${lora_cache_max_adapter_size}" - } -} -parameters: { - key: "lora_cache_gpu_memory_fraction" - value: { - string_value: "${lora_cache_gpu_memory_fraction}" - } -} -parameters: { - key: "lora_cache_host_memory_bytes" - value: { - string_value: "${lora_cache_host_memory_bytes}" - } -} -parameters: { - key: "decoding_mode" - value: { - string_value: "${decoding_mode}" - } -} -parameters: { - key: "executor_worker_path" - value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" - } -} -parameters: { - key: "medusa_choices" - value: { - string_value: "${medusa_choices}" - } -} -parameters: { - key: "gpu_weights_percent" - value: { - string_value: "${gpu_weights_percent}" - } -} diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py deleted file mode 100644 index c621cc14b4..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/decode.py +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - decoder_text_input: np.ndarray = None - max_tokens: np.ndarray = np.array([]) - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) - _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not stream, "streaming is not supported with speculative decoding" 
- ) - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding", - ) - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - decoder_input_ids: np.ndarray = None - input_lengths: np.ndarray = np.array([]) - decoder_input_lengths: np.ndarray = None - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - - @classmethod - def with_new_inputs( - cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None, - ): - return cls( - input_ids=(input_ids if input_ids is not None else other.input_ids), - input_lengths=( - input_lengths if input_lengths is not None else other.input_lengths - ), - decoder_input_ids=other.decoder_input_ids, - decoder_input_lengths=other.decoder_input_lengths, - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return ( - np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) - and np.array_equal(self.generation_logits, o.generation_logits) - ) - - -class Decoder: - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode( - self, request: Request, speculative_decoding=False - ) -> Generator[Response, None, None]: - preproc_response = self.preprocess(request) - - # print(f"[DEBUG] Decoder.decode {request.temperature=}") - if speculative_decoding: - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming: - gen_response = self._generate_non_streaming(preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids) :], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, request: Request - ) -> Generator[GenerationResponse, None, None]: - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - 
draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1, - ) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens - ) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None and request.use_draft_logits[0]: - if draft_response.generation_logits is not None: - draft_logits = draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids) : seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0) - ) - if request.use_draft_logits is not None and request.use_draft_logits[0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens) :], 0 - ) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request - ) - last_input_ids = input_ids - input_ids = target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, - np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32), - ) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = len(input_ids) >= len(prompt_input_ids) + output_len - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. 
- # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids - ) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0] - ) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, num_draft_tokens: int - ) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = ( - new_tokens - if (self._accumulated_tokens is None) - else np.concatenate((self._accumulated_tokens, new_tokens), axis=2) - ) - sequence_lengths = np.array( - [[self._accumulated_tokens.shape[2]]], dtype=np.int32 - ) - return self._postprocess( - self._accumulated_tokens, sequence_lengths, gen_response - ) - else: - return self._postprocess(gen_response.output_ids, None, gen_response) - - def _postprocess( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index 62c06f4836..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - def __init__( - self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None, - ): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "DECODER_INPUT_ID", - "REQUEST_INPUT_LEN", - "REQUEST_DECODER_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - ] - - self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "decoder_text_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": 
"generation_logits", - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError(f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response( - self, triton_response, response_factory: Callable, name_map=None - ): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'" - ) - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs, - ) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "decoder_text_input": "DECODER_QUERY", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "DECODER_INPUT_ID": "decoder_input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "REQUEST_DECODER_INPUT_LEN": "decoder_input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": "stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - } - return self.convert_triton_response(triton_output, PreprocResponse, name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, num_draft_tokens: int - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors( - preproc, request, num_draft_tokens, None, True - ) - triton_req = pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - triton_response = 
self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None, - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs, - ) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors( - self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False, - ): - tensors = [] - # print(f"[get_llm_tensors] {request.temperature=}") - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request( - request, num_output_tokens, draft_request, is_draft_model_request - ) - ) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "decoder_input_ids": "decoder_input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False, - ): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - # "temperature": "temperature", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - "presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - # print(f"[get_llm_tensors_from_request] {request.temperature=}") - temp_found = "temperature" in name_map - # print(f"[get_llm_tensors_from_request] temperature in name_map = {temp_found}") - tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor( - "request_output_len", np.array([[out_len]], dtype=np.int32) - ) - ) - - if draft_request: - if 
draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", draft_request.draft_input_ids) - ) - if ( - draft_request.draft_logits is not None - and request.use_draft_logits is not None - and request.use_draft_logits[0] - ): - tensors.append( - pb_utils.Tensor("draft_logits", draft_request.draft_logits) - ) - - return_context_logits = False - return_generation_logits = False - if draft_request is None: - if is_draft_model_request: - return_generation_logits = ( - request.use_draft_logits[0] - if request.use_draft_logits is not None - else False - ) - else: - return_context_logits = ( - request.return_context_logits[0] - if request.return_context_logits is not None - else False - ) - return_generation_logits = ( - request.return_generation_logits[0] - if request.return_generation_logits is not None - else False - ) - - tensors.append( - pb_utils.Tensor( - "return_context_logits", np.array([[return_context_logits]]) - ) - ) - tensors.append( - pb_utils.Tensor( - "return_generation_logits", np.array([[return_generation_logits]]) - ) - ) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - } - return self.convert_triton_response(triton_output, GenerationResponse, name_map) - - def _postprocess( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ) -> Response: - input_tensors = self._get_postproc_tensors( - tokens, sequence_lengths, gen_response - ) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs, - ) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors( - self, - tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse, - ): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", - sequence_lengths if sequence_lengths else gen_response.sequence_length, - ), - ] - return tensors - - def _get_response(self, triton_output, gen_res: GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response( - text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits, - ) - return response diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 0a5d54546d..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - def initialize(self, args): - # Parse model configs - model_config = json.loads(args["model_config"]) - - params = model_config["parameters"] - - accumulate_tokens_str = "" - if "accumulate_tokens" in params: - accumulate_tokens_str = params["accumulate_tokens"]["string_value"] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - "true", - "yes", - "1", - "t", - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"]["string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params["tensorrt_llm_draft_model_name"][ - "string_value" - ] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name, - ) - - def execute(self, requests): - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - req = self.decoder.convert_triton_request(request) - req.validate() - # print(f"[DEBUG] ========= [bls model.py] {req.temperature=} ===========") - speculative_decode = ( - req.num_draft_tokens is not None and req.num_draft_tokens[0][0] > 0 - ) - if speculative_decode and ( - self.draft_llm_model_name is None or self.draft_llm_model_name == "" - ): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode - ) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - 
error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(traceback.format_exc()), - ) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") diff --git a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt b/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index aa3b26336c..0000000000 --- a/python/openai/openai/tests/tensorrtllm_models/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
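For reference, the BLS flow removed above boils down to a Python backend model calling another Triton model in-process and relaying the result. A minimal sketch of that call pattern, simplified from the deleted test model (the "preprocessing", "QUERY", and "INPUT_ID" names mirror the code above; the "input_ids" output name is illustrative only):

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for request in requests:
            # Incoming text to tokenize via the downstream "preprocessing" model.
            text_input = pb_utils.get_input_tensor_by_name(request, "text_input")

            # Build and execute a request against the downstream model (BLS).
            infer_request = pb_utils.InferenceRequest(
                model_name="preprocessing",
                inputs=[pb_utils.Tensor("QUERY", text_input.as_numpy())],
                requested_output_names=["INPUT_ID"],
            )
            infer_response = infer_request.exec()
            if infer_response.has_error():
                raise pb_utils.TritonModelException(infer_response.error().message())

            # Relay the downstream tensor back to the caller.
            input_ids = pb_utils.get_output_tensor_by_name(infer_response, "INPUT_ID")
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[pb_utils.Tensor("input_ids", input_ids.as_numpy())]
                )
            )
        return responses

The deleted model wraps this same pattern in the TritonDecoder helper and layers decoupled (streaming) response sending and speculative-decoding handling on top of it.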
- -backend: "python" -max_batch_size: 256 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "decoder_text_input" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "num_draft_tokens", - data_type: TYPE_INT32, - dims: [ 1 ] - optional: true - }, - { - name: "use_draft_logits", - data_type: TYPE_BOOL, - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters: { - key: "accumulate_tokens" - value: { - string_value: "${accumulate_tokens}" - } -} -parameters: { - key: "tensorrt_llm_model_name" - value: { - string_value: "tensorrt_llm" - } -} -parameters: { - key: "tensorrt_llm_draft_model_name" - value: { - string_value: "" - } -} - -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] diff --git a/python/openai/openai/tests/test_chat_completions.py b/python/openai/openai/tests/test_chat_completions.py index 7e5548252a..d15b636dbc 100644 --- a/python/openai/openai/tests/test_chat_completions.py +++ 
b/python/openai/openai/tests/test_chat_completions.py @@ -83,7 +83,7 @@ def test_chat_completions_sampling_parameters( }, ) - # TODO: Add support and remove this check + # FIXME: Add support and remove this check unsupported_parameters = ["logprobs", "logit_bias"] if param_key in unsupported_parameters: assert response.status_code == 400 @@ -421,7 +421,6 @@ def test_request_logprobs(self): def test_request_logit_bias(self): pass - # TODO: Do we want to support "usage" field for token counts in response? @pytest.mark.skip(reason="Not Implemented Yet") def test_usage_response(self): pass diff --git a/python/openai/openai/tests/test_openai_client.py b/python/openai/openai/tests/test_openai_client.py index 7a2a08fe19..021fb5e883 100644 --- a/python/openai/openai/tests/test_openai_client.py +++ b/python/openai/openai/tests/test_openai_client.py @@ -113,17 +113,48 @@ async def test_openai_client_chat_completion( assert chat_completion.choices[0].finish_reason == "stop" print(f"Chat completion results: {chat_completion}") - # TODO: Add this test - @pytest.mark.skip(reason="Not Implemented Yet") @pytest.mark.asyncio - async def test_completion_streaming(self): - pass + async def test_completion_streaming( + self, client: openai.AsyncOpenAI, model: str, prompt: str + ): + # Test single completion for comparison + chat_completion = await client.completions.create( + model=model, + prompt=prompt, + max_tokens=10, + temperature=0.0, + stream=False, + ) + output = chat_completion.choices[0].text + stop_reason = chat_completion.choices[0].finish_reason + + # Test streaming + stream = await client.completions.create( + model=model, + prompt=prompt, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0] + if delta.text: + chunks.append(delta.text) + if delta.finish_reason is not None: + finish_reason_count += 1 + + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert "".join(chunks) == output @pytest.mark.asyncio async def test_chat_streaming( self, client: openai.AsyncOpenAI, model: str, messages: List[dict] ): - # test single completion + # Test single chat completion for comparison chat_completion = await client.chat.completions.create( model=model, messages=messages, @@ -134,7 +165,7 @@ async def test_chat_streaming( output = chat_completion.choices[0].message.content stop_reason = chat_completion.choices[0].finish_reason - # test streaming + # Test streaming stream = await client.chat.completions.create( model=model, messages=messages, @@ -152,6 +183,7 @@ async def test_chat_streaming( chunks.append(delta.content) if chunk.choices[0].finish_reason is not None: finish_reason_count += 1 + # finish reason should only return in last block assert finish_reason_count == 1 assert chunk.choices[0].finish_reason == stop_reason From d00d23702097e4bfe586f49ce55b2c0a36ee4e0c Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 17:03:06 -0700 Subject: [PATCH 51/80] Remove unnecessary TODOs --- python/openai/openai/src/api_server.py | 4 +- .../openai/src/routers/chat_completions.py | 4 -- python/openai/openai/src/schemas/openai.py | 4 -- python/openai/openai/src/utils/triton.py | 48 +++---------------- 4 files changed, 8 insertions(+), 52 deletions(-) diff --git a/python/openai/openai/src/api_server.py b/python/openai/openai/src/api_server.py index 1b7543a4a0..5ca53ab947 100644 --- 
a/python/openai/openai/src/api_server.py +++ b/python/openai/openai/src/api_server.py @@ -15,6 +15,7 @@ def add_cors_middleware(app: FastAPI): "http://localhost", ] + # TODO: Move towards logger instead of printing print(f"[WARNING] Adding CORS for the following origins: {origins}") app.add_middleware( CORSMiddleware, @@ -68,7 +69,4 @@ def init_app(): # NOTE: For debugging purposes, should generally be restricted or removed add_cors_middleware(app) - # TODO: Add common logger and use logger.debug in place of current print - # statements for debugging purposes. - return app diff --git a/python/openai/openai/src/routers/chat_completions.py b/python/openai/openai/src/routers/chat_completions.py index a970574e71..9711d1b598 100644 --- a/python/openai/openai/src/routers/chat_completions.py +++ b/python/openai/openai/src/routers/chat_completions.py @@ -83,10 +83,6 @@ def create_chat_completion( Creates a model response for the given chat conversation. """ - # TODO: Cleanup - print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") - print(f"[DEBUG] Fetching model metadata for {request.model=}") - model_metadatas = raw_request.app.models if not model_metadatas: raise HTTPException(status_code=400, detail="No known models") diff --git a/python/openai/openai/src/schemas/openai.py b/python/openai/openai/src/schemas/openai.py index 488dfda3bb..194ad681c5 100644 --- a/python/openai/openai/src/schemas/openai.py +++ b/python/openai/openai/src/schemas/openai.py @@ -252,10 +252,6 @@ class ChatCompletionRequestFunctionMessage(BaseModel): class FunctionParameters(BaseModel): model_config = ConfigDict(extra="allow") - # class Config: - # # TODO: Remove - # #extra = Extra.allow - # extra = "allow" class ChatCompletionFunctions(BaseModel): diff --git a/python/openai/openai/src/utils/triton.py b/python/openai/openai/src/utils/triton.py index 42a92fa34d..86fccd18a3 100644 --- a/python/openai/openai/src/utils/triton.py +++ b/python/openai/openai/src/utils/triton.py @@ -9,13 +9,8 @@ from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest from src.utils.tokenizer import get_tokenizer -# TODO: Refactor -# NOTE: Allow python backend for testing purposes -SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm", "python"} -LLM_BACKENDS: set = {"vllm", "tensorrtllm"} - -# TODO: pydantic validation? +# TODO: Stricter pydantic validation would be better in future @dataclass class TritonModelMetadata: # Name used in Triton model repository @@ -24,14 +19,15 @@ class TritonModelMetadata: backend: str # Triton model object handle model: tritonserver.Model - # TODO: Address typing + # Tokenizers used for chat templates tokenizer: typing.Optional[typing.Any] # Time that model was loaded by Triton create_time: int - # TODO: Address typing + # Conversion format between OpenAI and Triton requests request_convert_fn: typing.Optional[typing.Any] +# TODO: Expose explicit flag to catch edge cases def determine_request_format(backend): # Request conversion from OpenAI format to backend-specific format if backend == "vllm": @@ -45,35 +41,11 @@ def determine_request_format(backend): return request_convert_fn -# TODO: Refactor: -# NOTE: We need to figure out a few things while looking at the models in the -# triton model repository. -# 1. Which model should we interact with when sending requests to Triton core? -# a. For a single model, this is trivial, and would support any backend. -# b. For TRT-LLM, this should be 'ensemble' or 'tensorrt_llm_bls' following -# TRT-LLM defaults/examples. 
However, this could also be renamed by the user -# to have a more intuitive front-facing name, such as "llama3-8b". Note that -# TRT-LLM pipelines produced by the Triton CLI will generally be renamed like -# this. FIXME: This is a relatively fragile flow and should be improved. -# 2. Which tokenizer to use for things like applying a chat template or making -# a tool/function call. These are primarily relevant for the /chat/completions -# endpoint, but not the /completions endpoint. -# - For now, require user-defined TOKENIZER for simplicity. -# 3. Which inputs/outputs/parameters should be set when creating the underlying -# triton inference request? The inference request fields required will differ -# for vLLM, TRT-LLM, and user-defined models like a custom python model. So we -# need to know how to correctly translate the OpenAI schema parameters to -# a triton inference request. -# - For now, we will look for either vllm or trtllm in list of loaded backends, -# and we consider python==trtllm for now due to possibility of python runtime. -# We may want to consider using Triton's "runtime" config field for this for -# easier detection instead. def load_models(server): model_metadatas = [] backends = [] - # TODO: Support tokenizers more generically or custom tokenizers, possibly - # by looking for tokenizer.json in a pre-specified location? + # TODO: Consider support for custom tokenizers tokenizer = None tokenizer_model = os.environ.get("TOKENIZER") if tokenizer_model: @@ -85,7 +57,7 @@ def load_models(server): names = [] # Load all triton models and gather the respective backends of each for name, version in server.models().keys(): - # TODO: Why skip known version? Already loaded? + # Skip models that are already loaded, if any if version != -1: continue @@ -122,7 +94,7 @@ def init_tritonserver(): ) log_verbose_level = int(os.environ.get("TRITON_LOG_VERBOSE_LEVEL", "0")) - print("Starting Triton Server Core...") + print("Starting Triton Server...") server = tritonserver.Server( model_repository=model_repository, log_verbose=log_verbose_level, @@ -177,8 +149,6 @@ def create_vllm_inference_request( exclude=excludes, exclude_none=True, ) - print(f"[DEBUG] {sampling_parameters=}") - inputs["text_input"] = [prompt] inputs["stream"] = [request.stream] exclude_input_in_output = True @@ -186,8 +156,6 @@ def create_vllm_inference_request( if echo: exclude_input_in_output = not echo inputs["exclude_input_in_output"] = [exclude_input_in_output] - - print(f"[DEBUG] Triton Inference Request {inputs=}") return model.create_request(inputs=inputs, parameters=sampling_parameters) @@ -214,6 +182,4 @@ def create_trtllm_inference_request( inputs["random_seed"] = np.uint64([[request.seed]]) if request.temperature is not None: inputs["temperature"] = np.float32([[request.temperature]]) - - print(f"[DEBUG] Triton Inference Request {inputs=}") return model.create_request(inputs=inputs) From ae2fcd6b6b93b410e0d7682e7ea29c67d68af1b7 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 17:23:23 -0700 Subject: [PATCH 52/80] Add copyrights and replace dupe test model --- python/openai/docker/Dockerfile.tensorrtllm | 26 +++++ python/openai/docker/Dockerfile.vllm | 26 +++++ python/openai/openai/main.py | 31 ++++- python/openai/openai/src/__init__.py | 25 ++++ python/openai/openai/src/api_server.py | 26 +++++ python/openai/openai/src/routers/__init__.py | 25 ++++ .../openai/src/routers/chat_completions.py | 26 +++++ .../openai/openai/src/routers/completions.py | 26 +++++ 
python/openai/openai/src/routers/models.py | 26 +++++ .../openai/src/routers/observability.py | 26 +++++ python/openai/openai/src/schemas/__init__.py | 25 ++++ python/openai/openai/src/utils/__init__.py | 25 ++++ python/openai/openai/src/utils/triton.py | 26 +++++ python/openai/openai/tests/__init__.py | 25 ++++ python/openai/openai/tests/conftest.py | 26 +++++ .../openai/tests/test_chat_completions.py | 26 +++++ .../openai/openai/tests/test_completions.py | 26 +++++ .../tests/test_models/identity_py/1/model.py | 40 +++++++ .../{mock_llm_2 => identity_py}/config.pbtxt | 27 ++--- .../tests/test_models/mock_llm/1/model.py | 2 +- .../tests/test_models/mock_llm_2/1/model.py | 108 ------------------ .../openai/openai/tests/test_observability.py | 30 ++++- .../openai/openai/tests/test_openai_client.py | 26 +++++ python/openai/openai/tests/utils.py | 26 +++++ 24 files changed, 571 insertions(+), 130 deletions(-) create mode 100644 python/openai/openai/tests/test_models/identity_py/1/model.py rename python/openai/openai/tests/test_models/{mock_llm_2 => identity_py}/config.pbtxt (84%) delete mode 100644 python/openai/openai/tests/test_models/mock_llm_2/1/model.py diff --git a/python/openai/docker/Dockerfile.tensorrtllm b/python/openai/docker/Dockerfile.tensorrtllm index 922c38a18f..7bdce7499e 100644 --- a/python/openai/docker/Dockerfile.tensorrtllm +++ b/python/openai/docker/Dockerfile.tensorrtllm @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 FROM ${BASE_IMAGE} diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index 15c6c7a122..5be42b4fbe 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 FROM ${BASE_IMAGE} diff --git a/python/openai/openai/main.py b/python/openai/openai/main.py index 4f6f11a9f9..eb28bad132 100755 --- a/python/openai/openai/main.py +++ b/python/openai/openai/main.py @@ -1,4 +1,31 @@ #!/usr/bin/env python3 + +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import argparse import os @@ -48,8 +75,8 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - # NOTE: Think about other ways to pass triton args to fastapi app, - # but use env vars for simplicity for now. 
+ # NOTE: configurations can be passed to FastAPI app through a builder + # function like init_app in future, but using env vars for simplicity. if args.model_repository: os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository if args.tokenizer: diff --git a/python/openai/openai/src/__init__.py b/python/openai/openai/src/__init__.py index e69de29bb2..dc1c939c66 100644 --- a/python/openai/openai/src/__init__.py +++ b/python/openai/openai/src/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai/src/api_server.py b/python/openai/openai/src/api_server.py index 5ca53ab947..105da9c836 100644 --- a/python/openai/openai/src/api_server.py +++ b/python/openai/openai/src/api_server.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from __future__ import annotations from contextlib import asynccontextmanager diff --git a/python/openai/openai/src/routers/__init__.py b/python/openai/openai/src/routers/__init__.py index e69de29bb2..dc1c939c66 100644 --- a/python/openai/openai/src/routers/__init__.py +++ b/python/openai/openai/src/routers/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai/src/routers/chat_completions.py b/python/openai/openai/src/routers/chat_completions.py index 9711d1b598..1a3c61cecd 100644 --- a/python/openai/openai/src/routers/chat_completions.py +++ b/python/openai/openai/src/routers/chat_completions.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import time import uuid diff --git a/python/openai/openai/src/routers/completions.py b/python/openai/openai/src/routers/completions.py index 5d1e9b12fa..7fdd23a424 100644 --- a/python/openai/openai/src/routers/completions.py +++ b/python/openai/openai/src/routers/completions.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import time import uuid diff --git a/python/openai/openai/src/routers/models.py b/python/openai/openai/src/routers/models.py index ff47000cfd..14cda956fe 100644 --- a/python/openai/openai/src/routers/models.py +++ b/python/openai/openai/src/routers/models.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from fastapi import APIRouter, HTTPException, Request from src.schemas.openai import ListModelsResponse, Model, ObjectType diff --git a/python/openai/openai/src/routers/observability.py b/python/openai/openai/src/routers/observability.py index 98d506dab5..9f18f9934c 100644 --- a/python/openai/openai/src/routers/observability.py +++ b/python/openai/openai/src/routers/observability.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from fastapi import APIRouter, HTTPException, Request from fastapi.responses import Response diff --git a/python/openai/openai/src/schemas/__init__.py b/python/openai/openai/src/schemas/__init__.py index e69de29bb2..dc1c939c66 100644 --- a/python/openai/openai/src/schemas/__init__.py +++ b/python/openai/openai/src/schemas/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai/src/utils/__init__.py b/python/openai/openai/src/utils/__init__.py index e69de29bb2..dc1c939c66 100644 --- a/python/openai/openai/src/utils/__init__.py +++ b/python/openai/openai/src/utils/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/python/openai/openai/src/utils/triton.py b/python/openai/openai/src/utils/triton.py index 86fccd18a3..966c1c2c02 100644 --- a/python/openai/openai/src/utils/triton.py +++ b/python/openai/openai/src/utils/triton.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import os import time import typing diff --git a/python/openai/openai/tests/__init__.py b/python/openai/openai/tests/__init__.py index e69de29bb2..dc1c939c66 100644 --- a/python/openai/openai/tests/__init__.py +++ b/python/openai/openai/tests/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai/tests/conftest.py b/python/openai/openai/tests/conftest.py index da6301fa04..e00d9157bd 100644 --- a/python/openai/openai/tests/conftest.py +++ b/python/openai/openai/tests/conftest.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from pathlib import Path import pytest diff --git a/python/openai/openai/tests/test_chat_completions.py b/python/openai/openai/tests/test_chat_completions.py index d15b636dbc..f2fbfbef74 100644 --- a/python/openai/openai/tests/test_chat_completions.py +++ b/python/openai/openai/tests/test_chat_completions.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import copy from pathlib import Path from typing import List diff --git a/python/openai/openai/tests/test_completions.py b/python/openai/openai/tests/test_completions.py index b5b7a6f2f5..ce0e5d2c92 100644 --- a/python/openai/openai/tests/test_completions.py +++ b/python/openai/openai/tests/test_completions.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import copy import pytest diff --git a/python/openai/openai/tests/test_models/identity_py/1/model.py b/python/openai/openai/tests/test_models/identity_py/1/model.py new file mode 100644 index 0000000000..7bbe4bf991 --- /dev/null +++ b/python/openai/openai/tests/test_models/identity_py/1/model.py @@ -0,0 +1,40 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def execute(self, requests): + """ + Identity model in Python backend. + """ + responses = [] + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") + out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy()) + responses.append(pb_utils.InferenceResponse([out_tensor])) + return responses diff --git a/python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt b/python/openai/openai/tests/test_models/identity_py/config.pbtxt similarity index 84% rename from python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt rename to python/openai/openai/tests/test_models/identity_py/config.pbtxt index 5f665ff543..3926c830cb 100644 --- a/python/openai/openai/tests/test_models/mock_llm_2/config.pbtxt +++ b/python/openai/openai/tests/test_models/identity_py/config.pbtxt @@ -23,38 +23,29 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -backend: "python" - -max_batch_size: 0 -model_transaction_policy { - decoupled: True -} +backend: "python" +max_batch_size: 64 input [ { - name: "text_input" - data_type: TYPE_STRING - dims: [ 1, 1 ] - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1, 1 ] + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] } ] output [ { - name: "text_output" - data_type: TYPE_STRING - dims: [ 1, -1 ] + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ -1 ] } ] instance_group [ { count: 1 - kind: KIND_MODEL + kind : KIND_CPU } ] diff --git a/python/openai/openai/tests/test_models/mock_llm/1/model.py b/python/openai/openai/tests/test_models/mock_llm/1/model.py index 1cf5f3613c..0fc9053cd3 100644 --- a/python/openai/openai/tests/test_models/mock_llm/1/model.py +++ b/python/openai/openai/tests/test_models/mock_llm/1/model.py @@ -1,4 +1,4 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/python/openai/openai/tests/test_models/mock_llm_2/1/model.py b/python/openai/openai/tests/test_models/mock_llm_2/1/model.py deleted file mode 100644 index 1cf5f3613c..0000000000 --- a/python/openai/openai/tests/test_models/mock_llm_2/1/model.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import json -import time - -import numpy as np -import triton_python_backend_utils as pb_utils - - -class TritonPythonModel: - def initialize(self, args): - self.model_config = json.loads(args["model_config"]) - self.decoupled = self.model_config.get("model_transaction_policy", {}).get( - "decoupled" - ) - - def execute(self, requests): - if self.decoupled: - return self.exec_decoupled(requests) - else: - return self.exec(requests) - - def exec(self, requests): - responses = [] - for request in requests: - params = json.loads(request.parameters()) - rep_count = params["REPETITION"] if "REPETITION" in params else 1 - - input_np = pb_utils.get_input_tensor_by_name( - request, "text_intpu" - ).as_numpy() - stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() - stream = stream_np.flatten()[0] - if stream: - responses.append( - pb_utils.InferenceResponse( - error=pb_utils.TritonError( - "STREAM only supported in decoupled mode" - ) - ) - ) - else: - out_tensor = pb_utils.Tensor( - "text_output", np.repeat(input_np, rep_count, axis=1) - ) - responses.append(pb_utils.InferenceResponse([out_tensor])) - return responses - - def exec_decoupled(self, requests): - for request in requests: - params = json.loads(request.parameters()) - rep_count = params["REPETITION"] if "REPETITION" in params else 1 - fail_last = params["FAIL_LAST"] if "FAIL_LAST" in params else False - delay = params["DELAY"] if "DELAY" in params else None - - sender = request.get_response_sender() - input_np = pb_utils.get_input_tensor_by_name( - request, "text_input" - ).as_numpy() - stream_np = pb_utils.get_input_tensor_by_name(request, "stream").as_numpy() - out_tensor = pb_utils.Tensor("text_output", input_np) - response = pb_utils.InferenceResponse([out_tensor]) - # If stream enabled, just send multiple copies of response - # FIXME: Could split up response string into tokens, but this is simpler for now. - stream = stream_np.flatten()[0] - if stream: - for _ in range(rep_count): - if delay is not None: - time.sleep(delay) - sender.send(response) - sender.send( - None - if not fail_last - else pb_utils.InferenceResponse( - error=pb_utils.TritonError("An Error Occurred") - ), - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL, - ) - # If stream disabled, just send one response - else: - sender.send( - response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL - ) - return None diff --git a/python/openai/openai/tests/test_observability.py b/python/openai/openai/tests/test_observability.py index 7a4a4d7e81..7158e10b19 100644 --- a/python/openai/openai/tests/test_observability.py +++ b/python/openai/openai/tests/test_observability.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import os from pathlib import Path @@ -44,7 +70,7 @@ def test_startup_fail(self): def test_startup_metrics(self, client): response = client.get("/metrics") assert response.status_code == 200 - # FIXME: Flesh out more + # TODO: Flesh out metrics tests further # NOTE: response.json() works even on non-json prometheus data assert "nv_cpu_utilization" in response.json() @@ -53,6 +79,8 @@ def test_models_list(self, client): response = client.get("/v1/models") assert response.status_code == 200 models = response.json()["data"] + # Two models are in test_models specifically to verify that all models + # are listed by this endpoint. This can be removed if the behavior changes. assert len(models) == 2 for model in models: assert model["id"] diff --git a/python/openai/openai/tests/test_openai_client.py b/python/openai/openai/tests/test_openai_client.py index 021fb5e883..f1c566cbf2 100644 --- a/python/openai/openai/tests/test_openai_client.py +++ b/python/openai/openai/tests/test_openai_client.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ from typing import List import openai diff --git a/python/openai/openai/tests/utils.py b/python/openai/openai/tests/utils.py index 49136cfd5a..25816d77d8 100644 --- a/python/openai/openai/tests/utils.py +++ b/python/openai/openai/tests/utils.py @@ -1,3 +1,29 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import os import subprocess import sys From fc4c15a1c7970c7d2a512f65361fe780eafd3082 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 17:45:42 -0700 Subject: [PATCH 53/80] Add disclaimer around application state and multiprocessing --- python/openai/openai/src/api_server.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/openai/openai/src/api_server.py b/python/openai/openai/src/api_server.py index 105da9c836..64288e27e3 100644 --- a/python/openai/openai/src/api_server.py +++ b/python/openai/openai/src/api_server.py @@ -54,9 +54,14 @@ def add_cors_middleware(app: FastAPI): @asynccontextmanager async def lifespan(app: FastAPI): - print("Starting FastAPI app lifespan...") # Start the tritonserver on FastAPI app startup + print("Starting FastAPI app lifespan...") server, model_metadatas = init_tritonserver() + + # NOTE: These are meant for read-only access by routes handling requests + # with a single process, and should generally not be modified for the + # lifetime of the application. If multiple uvicorn workers are instantiated, + # then multiple triton servers would be started, one per worker process. 
app.server = server app.models = {metadata.name: metadata for metadata in model_metadatas} From 1ca988916233730038418a62077a48287a9c410e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 21 Aug 2024 18:20:33 -0700 Subject: [PATCH 54/80] Address CodeQL warnings --- .gitignore | 2 +- python/openai/openai/src/utils/triton.py | 2 +- python/openai/openai/tests/conftest.py | 6 +++-- .../openai/tests/test_chat_completions.py | 2 +- .../openai/openai/tests/test_completions.py | 2 +- .../openai/openai/tests/test_observability.py | 22 +++++++++++-------- 6 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 02be5ecddc..7974ad5fa7 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ artifacts # Test exclusions qa/L0_openai/openai -qa/L0_openai/tensorrtllm_models +tensorrtllm_models diff --git a/python/openai/openai/src/utils/triton.py b/python/openai/openai/src/utils/triton.py index 966c1c2c02..2ff239849b 100644 --- a/python/openai/openai/src/utils/triton.py +++ b/python/openai/openai/src/utils/triton.py @@ -139,7 +139,7 @@ def get_output(response): if "text_output" in response.outputs: try: return response.outputs["text_output"].to_string_array()[0] - except: + except Exception: return str(response.outputs["text_output"].to_bytes_array()[0]) return "" diff --git a/python/openai/openai/tests/conftest.py b/python/openai/openai/tests/conftest.py index e00d9157bd..4952beaef0 100644 --- a/python/openai/openai/tests/conftest.py +++ b/python/openai/openai/tests/conftest.py @@ -36,13 +36,15 @@ TEST_PROMPT = "What is machine learning?" TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] TEST_TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct" + +# Infer the test environment for simplicity in local dev/testing. try: import vllm as _ TEST_BACKEND = "vllm" TEST_MODEL = "llama-3.1-8b-instruct" except ImportError: - pass + print("No vllm installation found.") try: import tensorrt_llm as _ @@ -50,7 +52,7 @@ TEST_BACKEND = "tensorrtllm" TEST_MODEL = "tensorrt_llm_bls" except ImportError: - pass + print("No tensorrt_llm installation found.") if not TEST_BACKEND or not TEST_MODEL: raise Exception("Unknown test environment") diff --git a/python/openai/openai/tests/test_chat_completions.py b/python/openai/openai/tests/test_chat_completions.py index f2fbfbef74..cf0b278c38 100644 --- a/python/openai/openai/tests/test_chat_completions.py +++ b/python/openai/openai/tests/test_chat_completions.py @@ -49,7 +49,7 @@ def test_chat_completions_defaults(self, client, model: str, messages: List[dict assert message["content"].strip() assert message["role"] == "assistant" # "usage" currently not supported - assert response.json()["usage"] == None + assert not response.json()["usage"] def test_chat_completions_system_prompt(self, client, model: str): # NOTE: Currently just sanity check that there are no issues when a diff --git a/python/openai/openai/tests/test_completions.py b/python/openai/openai/tests/test_completions.py index ce0e5d2c92..13b7d99686 100644 --- a/python/openai/openai/tests/test_completions.py +++ b/python/openai/openai/tests/test_completions.py @@ -46,7 +46,7 @@ def test_completions_defaults(self, client, model: str, prompt: str): # or tested with dummy identity model. 
assert response.json()["choices"][0]["text"].strip() # "usage" currently not supported - assert response.json()["usage"] == None + assert not response.json()["usage"] @pytest.mark.parametrize( "sampling_parameter, value", diff --git a/python/openai/openai/tests/test_observability.py b/python/openai/openai/tests/test_observability.py index 7158e10b19..8839f6248c 100644 --- a/python/openai/openai/tests/test_observability.py +++ b/python/openai/openai/tests/test_observability.py @@ -57,15 +57,6 @@ def test_startup_success(self, client): response = client.get("/health") assert response.status_code == 200 - def test_startup_fail(self): - os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" - with pytest.raises(Exception): - # Test that FastAPI lifespan startup fails when initializing Triton - # with unknown model repository. - app = init_app() - with TestClient(app): - pass - ### Metrics ### def test_startup_metrics(self, client): response = client.get("/metrics") @@ -96,3 +87,16 @@ def test_models_get(self, client, model): assert model_resp["object"] == "model" assert model_resp["created"] > 0 assert model_resp["owned_by"] == "Triton Inference Server" + + +# For tests that won't use the same pytest fixture for server startup across +# the whole class test suite. +class TestObservabilityCustomFixture: + def test_startup_fail(self): + os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" + with pytest.raises(Exception): + # Test that FastAPI lifespan startup fails when initializing Triton + # with unknown model repository. + app = init_app() + with TestClient(app): + pass From 92a27e512b1c17e2ac3a7b7de17ead942fb8aedb Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 23 Aug 2024 11:36:23 -0700 Subject: [PATCH 55/80] Add quickstart vllm dockerfile for sharing purposes --- python/openai/README.md | 2 -- python/openai/docker/Dockerfile.vllm | 12 +++++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/openai/README.md b/python/openai/README.md index c48c151637..f6c245d7cf 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -20,10 +20,8 @@ docker build -t tritonserver-openai-vllm -f docker/Dockerfile.vllm . 
docker run -it --net=host --gpus all --rm \ - -v ${PWD}:/workspace \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ - -w /workspace \ tritonserver-openai-vllm ``` diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index 5be42b4fbe..201e8ca2e6 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -29,6 +29,12 @@ FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl -COPY requirements.txt requirements_vllm.txt /tmp -RUN pip install -r /tmp/requirements.txt && \ - pip install -r /tmp/requirements_vllm.txt +# TODO: Update along with other folder/structure changes in review comments +WORKDIR /workspace +RUN git clone --single-branch -b rmccormick-openai https://github.com/triton-inference-server/server.git && \ + pip install -r server/python/openai/docker/requirements.txt && \ + pip install -r server/python/openai/docker/requirements_vllm.txt && \ + mv server/python/openai/openai ./openai && \ + mv server/python/openai/examples ./openai && \ + mv server/python/openai/README.md ./openai && \ + rm -r server From 9c3ee15d39454685c2d325dc1519da0bf5afe365 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 23 Aug 2024 11:45:11 -0700 Subject: [PATCH 56/80] Remove workspace mount mention --- python/openai/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/python/openai/README.md b/python/openai/README.md index f6c245d7cf..93a7d5b593 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -11,7 +11,6 @@ ## VLLM 1. Build and launch the container: - - Mounts the openai source files to `/workspace` for simplicity, later on these will be shipped in the container. - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. - Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to access gated models, make sure this is set in your local environment if needed. 
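A minimal illustrative request against the quickstart above: assuming the container has been built and the OpenAI-compatible frontend launched against the vLLM model repository, and that it is listening on the default `http://localhost:8000` while serving the `llama-3.1-8b-instruct` model used in these tests (adjust the model name and address to match your setup), a chat completion can be requested like so:

```bash
# Illustrative sketch only; assumes the frontend launched in the quickstart above is
# running on localhost:8000 and serving the llama-3.1-8b-instruct model.
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "llama-3.1-8b-instruct",
    "messages": [{"role": "user", "content": "What is machine learning?"}]
  }'
```
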
From 886ee7d212a8f6774d6dfefdeffeb328f32fad24 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 23 Aug 2024 15:49:42 -0700 Subject: [PATCH 57/80] Review feedback: rename package, move tests out of package, remove nested src/ dir in package --- python/openai/README.md | 10 +++++----- python/openai/docker/Dockerfile.vllm | 2 +- .../{openai/src => openai_frontend}/__init__.py | 0 .../src/api_server.py => openai_frontend/app.py} | 4 ++-- python/openai/{openai => openai_frontend}/main.py | 7 +++---- .../src => openai_frontend}/routers/__init__.py | 0 .../routers/chat_completions.py | 13 ++++--------- .../src => openai_frontend}/routers/completions.py | 4 ++-- .../src => openai_frontend}/routers/models.py | 2 +- .../routers/observability.py | 0 .../src => openai_frontend}/schemas/__init__.py | 0 .../src => openai_frontend}/schemas/openai.py | 0 .../src => openai_frontend}/utils/__init__.py | 0 .../src => openai_frontend}/utils/tokenizer.py | 0 .../{openai/src => openai_frontend}/utils/triton.py | 10 +++++----- python/openai/{openai => }/tests/__init__.py | 0 python/openai/{openai => }/tests/conftest.py | 0 .../{openai => }/tests/test_chat_completions.py | 0 .../openai/{openai => }/tests/test_completions.py | 0 .../tests/test_models/identity_py/1/model.py | 0 .../tests/test_models/identity_py/config.pbtxt | 0 .../tests/test_models/mock_llm/1/model.py | 0 .../tests/test_models/mock_llm/config.pbtxt | 0 .../openai/{openai => }/tests/test_observability.py | 5 ++--- .../openai/{openai => }/tests/test_openai_client.py | 0 python/openai/{openai => }/tests/utils.py | 6 ++++-- .../vllm_models/llama-3.1-8b-instruct/1/model.json | 0 .../vllm_models/llama-3.1-8b-instruct/config.pbtxt | 0 28 files changed, 29 insertions(+), 34 deletions(-) rename python/openai/{openai/src => openai_frontend}/__init__.py (100%) rename python/openai/{openai/src/api_server.py => openai_frontend/app.py} (96%) rename python/openai/{openai => openai_frontend}/main.py (95%) rename python/openai/{openai/src => openai_frontend}/routers/__init__.py (100%) rename python/openai/{openai/src => openai_frontend}/routers/chat_completions.py (94%) rename python/openai/{openai/src => openai_frontend}/routers/completions.py (98%) rename python/openai/{openai/src => openai_frontend}/routers/models.py (98%) rename python/openai/{openai/src => openai_frontend}/routers/observability.py (100%) rename python/openai/{openai/src => openai_frontend}/schemas/__init__.py (100%) rename python/openai/{openai/src => openai_frontend}/schemas/openai.py (100%) rename python/openai/{openai/src => openai_frontend}/utils/__init__.py (100%) rename python/openai/{openai/src => openai_frontend}/utils/tokenizer.py (100%) rename python/openai/{openai/src => openai_frontend}/utils/triton.py (97%) rename python/openai/{openai => }/tests/__init__.py (100%) rename python/openai/{openai => }/tests/conftest.py (100%) rename python/openai/{openai => }/tests/test_chat_completions.py (100%) rename python/openai/{openai => }/tests/test_completions.py (100%) rename python/openai/{openai => }/tests/test_models/identity_py/1/model.py (100%) rename python/openai/{openai => }/tests/test_models/identity_py/config.pbtxt (100%) rename python/openai/{openai => }/tests/test_models/mock_llm/1/model.py (100%) rename python/openai/{openai => }/tests/test_models/mock_llm/config.pbtxt (100%) rename python/openai/{openai => }/tests/test_observability.py (96%) rename python/openai/{openai => }/tests/test_openai_client.py (100%) rename python/openai/{openai => }/tests/utils.py (96%) 
rename python/openai/{openai => }/tests/vllm_models/llama-3.1-8b-instruct/1/model.json (100%) rename python/openai/{openai => }/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt (100%) diff --git a/python/openai/README.md b/python/openai/README.md index 93a7d5b593..f6f26664e3 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -24,10 +24,10 @@ docker run -it --net=host --gpus all --rm \ tritonserver-openai-vllm ``` -2. Launch the OpenAI server: +2. Launch the OpenAI-compatible Triton Inference Server: ```bash # NOTE: Adjust the --tokenizer based on the model being used -python3 openai/main.py --model-repository openai/tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +python3 openai_frontend/main.py --model-repository tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` 3. Send a `/v1/chat/completions` request: @@ -93,7 +93,7 @@ print(completion.choices[0].message.content) 7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): ```bash -cd openai/tests/ +cd tests/ pytest -v ``` @@ -115,7 +115,7 @@ at the model repository accordingly when following the examples. access gated models, make sure this is set in your local environment if needed. ```bash -docker build -t tritonserver-openai-vllm -f docker/Dockerfile.tensorrtllm . +docker build -t tritonserver-openai-tensorrtllm -f docker/Dockerfile.tensorrtllm ./docker docker run -it --net=host --gpus all --rm \ -v ${PWD}:/workspace \ @@ -128,7 +128,7 @@ docker run -it --net=host --gpus all --rm \ 2. Launch the OpenAI server: ```bash # NOTE: Adjust the --tokenizer based on the model being used -python3 openai/main.py --model-repository openai/tests/tensorrtllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` 3. 
Send a `/v1/chat/completions` request: diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index 201e8ca2e6..8a04d63484 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -34,7 +34,7 @@ WORKDIR /workspace RUN git clone --single-branch -b rmccormick-openai https://github.com/triton-inference-server/server.git && \ pip install -r server/python/openai/docker/requirements.txt && \ pip install -r server/python/openai/docker/requirements_vllm.txt && \ - mv server/python/openai/openai ./openai && \ + mv server/python/openai/openai_frontend ./openai && \ mv server/python/openai/examples ./openai && \ mv server/python/openai/README.md ./openai && \ rm -r server diff --git a/python/openai/openai/src/__init__.py b/python/openai/openai_frontend/__init__.py similarity index 100% rename from python/openai/openai/src/__init__.py rename to python/openai/openai_frontend/__init__.py diff --git a/python/openai/openai/src/api_server.py b/python/openai/openai_frontend/app.py similarity index 96% rename from python/openai/openai/src/api_server.py rename to python/openai/openai_frontend/app.py index 64288e27e3..ec2aa49f11 100644 --- a/python/openai/openai/src/api_server.py +++ b/python/openai/openai_frontend/app.py @@ -31,8 +31,8 @@ import tritonserver from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from src.routers import chat_completions, completions, models, observability -from src.utils.triton import init_tritonserver +from routers import chat_completions, completions, models, observability +from utils.triton import init_tritonserver def add_cors_middleware(app: FastAPI): diff --git a/python/openai/openai/main.py b/python/openai/openai_frontend/main.py similarity index 95% rename from python/openai/openai/main.py rename to python/openai/openai_frontend/main.py index eb28bad132..a95d59916f 100755 --- a/python/openai/openai/main.py +++ b/python/openai/openai_frontend/main.py @@ -30,7 +30,7 @@ import os import uvicorn -from src.api_server import init_app +from app import init_app def parse_args(): @@ -60,7 +60,7 @@ def parse_args(): triton_group.add_argument( "--model-repository", type=str, - default=None, + required=True, help="Path to the Triton model repository holding the models to be served", ) triton_group.add_argument( @@ -77,8 +77,7 @@ def parse_args(): args = parse_args() # NOTE: configurations can be passed to FastAPI app through a builder # function like init_app in future, but using env vars for simplicity. 
- if args.model_repository: - os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository + os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository if args.tokenizer: os.environ["TOKENIZER"] = args.tokenizer diff --git a/python/openai/openai/src/routers/__init__.py b/python/openai/openai_frontend/routers/__init__.py similarity index 100% rename from python/openai/openai/src/routers/__init__.py rename to python/openai/openai_frontend/routers/__init__.py diff --git a/python/openai/openai/src/routers/chat_completions.py b/python/openai/openai_frontend/routers/chat_completions.py similarity index 94% rename from python/openai/openai/src/routers/chat_completions.py rename to python/openai/openai_frontend/routers/chat_completions.py index 1a3c61cecd..c4e6445392 100644 --- a/python/openai/openai/src/routers/chat_completions.py +++ b/python/openai/openai_frontend/routers/chat_completions.py @@ -29,7 +29,7 @@ from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse -from src.schemas.openai import ( +from schemas.openai import ( ChatCompletionChoice, ChatCompletionFinishReason, ChatCompletionResponseMessage, @@ -40,7 +40,7 @@ CreateChatCompletionStreamResponse, ObjectType, ) -from src.utils.triton import get_output, validate_triton_responses +from utils.triton import get_output, validate_triton_responses router = APIRouter() @@ -128,13 +128,8 @@ def create_chat_completion( if not metadata.backend: raise HTTPException(status_code=400, detail="Unknown backend") - triton_model = raw_request.app.server.model(request.model) - if request.model != triton_model.name: - raise HTTPException( - status_code=400, - detail=f"Mismatched model name: {request.model} != {triton_model.name}", - ) - + # TODO: Cleanup + triton_model = metadata.model if request.n and request.n > 1: raise HTTPException(status_code=400, detail="Only single choice is supported") diff --git a/python/openai/openai/src/routers/completions.py b/python/openai/openai_frontend/routers/completions.py similarity index 98% rename from python/openai/openai/src/routers/completions.py rename to python/openai/openai_frontend/routers/completions.py index 7fdd23a424..e58dda0459 100644 --- a/python/openai/openai/src/routers/completions.py +++ b/python/openai/openai_frontend/routers/completions.py @@ -29,14 +29,14 @@ from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse -from src.schemas.openai import ( +from schemas.openai import ( Choice, CreateCompletionRequest, CreateCompletionResponse, FinishReason, ObjectType, ) -from src.utils.triton import get_output, validate_triton_responses +from utils.triton import get_output, validate_triton_responses router = APIRouter() diff --git a/python/openai/openai/src/routers/models.py b/python/openai/openai_frontend/routers/models.py similarity index 98% rename from python/openai/openai/src/routers/models.py rename to python/openai/openai_frontend/routers/models.py index 14cda956fe..b426dfcc15 100644 --- a/python/openai/openai/src/routers/models.py +++ b/python/openai/openai_frontend/routers/models.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from fastapi import APIRouter, HTTPException, Request -from src.schemas.openai import ListModelsResponse, Model, ObjectType +from schemas.openai import ListModelsResponse, Model, ObjectType router = APIRouter() diff --git a/python/openai/openai/src/routers/observability.py b/python/openai/openai_frontend/routers/observability.py similarity index 100% rename from python/openai/openai/src/routers/observability.py rename to python/openai/openai_frontend/routers/observability.py diff --git a/python/openai/openai/src/schemas/__init__.py b/python/openai/openai_frontend/schemas/__init__.py similarity index 100% rename from python/openai/openai/src/schemas/__init__.py rename to python/openai/openai_frontend/schemas/__init__.py diff --git a/python/openai/openai/src/schemas/openai.py b/python/openai/openai_frontend/schemas/openai.py similarity index 100% rename from python/openai/openai/src/schemas/openai.py rename to python/openai/openai_frontend/schemas/openai.py diff --git a/python/openai/openai/src/utils/__init__.py b/python/openai/openai_frontend/utils/__init__.py similarity index 100% rename from python/openai/openai/src/utils/__init__.py rename to python/openai/openai_frontend/utils/__init__.py diff --git a/python/openai/openai/src/utils/tokenizer.py b/python/openai/openai_frontend/utils/tokenizer.py similarity index 100% rename from python/openai/openai/src/utils/tokenizer.py rename to python/openai/openai_frontend/utils/tokenizer.py diff --git a/python/openai/openai/src/utils/triton.py b/python/openai/openai_frontend/utils/triton.py similarity index 97% rename from python/openai/openai/src/utils/triton.py rename to python/openai/openai_frontend/utils/triton.py index 2ff239849b..34a773b463 100644 --- a/python/openai/openai/src/utils/triton.py +++ b/python/openai/openai_frontend/utils/triton.py @@ -32,8 +32,8 @@ import numpy as np import tritonserver from fastapi import HTTPException -from src.schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest -from src.utils.tokenizer import get_tokenizer +from schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest +from utils.tokenizer import get_tokenizer # TODO: Stricter pydantic validation would be better in future @@ -68,7 +68,7 @@ def determine_request_format(backend): def load_models(server): - model_metadatas = [] + model_metadata = [] backends = [] # TODO: Consider support for custom tokenizers @@ -109,9 +109,9 @@ def load_models(server): create_time=create_time, request_convert_fn=determine_request_format(backend), ) - model_metadatas.append(metadata) + model_metadata.append(metadata) - return model_metadatas + return model_metadata def init_tritonserver(): diff --git a/python/openai/openai/tests/__init__.py b/python/openai/tests/__init__.py similarity index 100% rename from python/openai/openai/tests/__init__.py rename to python/openai/tests/__init__.py diff --git a/python/openai/openai/tests/conftest.py b/python/openai/tests/conftest.py similarity index 100% rename from python/openai/openai/tests/conftest.py rename to python/openai/tests/conftest.py diff --git a/python/openai/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py similarity index 100% rename from python/openai/openai/tests/test_chat_completions.py rename to python/openai/tests/test_chat_completions.py diff --git a/python/openai/openai/tests/test_completions.py b/python/openai/tests/test_completions.py similarity index 100% rename from python/openai/openai/tests/test_completions.py rename to 
python/openai/tests/test_completions.py diff --git a/python/openai/openai/tests/test_models/identity_py/1/model.py b/python/openai/tests/test_models/identity_py/1/model.py similarity index 100% rename from python/openai/openai/tests/test_models/identity_py/1/model.py rename to python/openai/tests/test_models/identity_py/1/model.py diff --git a/python/openai/openai/tests/test_models/identity_py/config.pbtxt b/python/openai/tests/test_models/identity_py/config.pbtxt similarity index 100% rename from python/openai/openai/tests/test_models/identity_py/config.pbtxt rename to python/openai/tests/test_models/identity_py/config.pbtxt diff --git a/python/openai/openai/tests/test_models/mock_llm/1/model.py b/python/openai/tests/test_models/mock_llm/1/model.py similarity index 100% rename from python/openai/openai/tests/test_models/mock_llm/1/model.py rename to python/openai/tests/test_models/mock_llm/1/model.py diff --git a/python/openai/openai/tests/test_models/mock_llm/config.pbtxt b/python/openai/tests/test_models/mock_llm/config.pbtxt similarity index 100% rename from python/openai/openai/tests/test_models/mock_llm/config.pbtxt rename to python/openai/tests/test_models/mock_llm/config.pbtxt diff --git a/python/openai/openai/tests/test_observability.py b/python/openai/tests/test_observability.py similarity index 96% rename from python/openai/openai/tests/test_observability.py rename to python/openai/tests/test_observability.py index 8839f6248c..c2b7ae2e3d 100644 --- a/python/openai/openai/tests/test_observability.py +++ b/python/openai/tests/test_observability.py @@ -29,7 +29,7 @@ import pytest from fastapi.testclient import TestClient -from src.api_server import init_app +from tests.utils import setup_fastapi_app # Override conftest.py default model @@ -42,8 +42,7 @@ class TestObservability: @pytest.fixture(scope="class") def client(self): model_repository = Path(__file__).parent / "test_models" - os.environ["TRITON_MODEL_REPOSITORY"] = str(model_repository) - app = init_app() + app = setup_fastapi_app(tokenizer="", model_repository=str(model_repository)) with TestClient(app) as test_client: yield test_client diff --git a/python/openai/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py similarity index 100% rename from python/openai/openai/tests/test_openai_client.py rename to python/openai/tests/test_openai_client.py diff --git a/python/openai/openai/tests/utils.py b/python/openai/tests/utils.py similarity index 96% rename from python/openai/openai/tests/utils.py rename to python/openai/tests/utils.py index 25816d77d8..2a1633a71d 100644 --- a/python/openai/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -33,7 +33,9 @@ import openai import requests -from src.api_server import init_app + +sys.path.append(os.path.join("..", "openai_frontend")) +from openai_frontend.app import init_app def setup_fastapi_app(tokenizer: str, model_repository: str): @@ -62,7 +64,7 @@ def __init__( env.update(env_dict) this_dir = Path(__file__).resolve().parent - script_path = this_dir / ".." / "main.py" + script_path = this_dir / ".." 
/ "openai_frontend" / "main.py" self.proc = subprocess.Popen( ["python3", script_path] + cli_args, env=env, diff --git a/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json b/python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json similarity index 100% rename from python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json rename to python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json diff --git a/python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt b/python/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt similarity index 100% rename from python/openai/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt rename to python/openai/tests/vllm_models/llama-3.1-8b-instruct/config.pbtxt From 21c099630721a0045aaef8d7239a191808bded99 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 23 Aug 2024 17:11:48 -0700 Subject: [PATCH 58/80] Review feedback: naming nits, more type hints, helper functions --- .../routers/chat_completions.py | 74 +++++++++++-------- .../openai_frontend/routers/completions.py | 53 +++++++------ .../openai/openai_frontend/routers/models.py | 14 ++-- python/openai/openai_frontend/utils/triton.py | 22 +++--- python/openai/tests/utils.py | 2 +- 5 files changed, 93 insertions(+), 72 deletions(-) diff --git a/python/openai/openai_frontend/routers/chat_completions.py b/python/openai/openai_frontend/routers/chat_completions.py index c4e6445392..aa608f72be 100644 --- a/python/openai/openai_frontend/routers/chat_completions.py +++ b/python/openai/openai_frontend/routers/chat_completions.py @@ -26,6 +26,7 @@ import time import uuid +from typing import Dict, List from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse @@ -40,19 +41,24 @@ CreateChatCompletionStreamResponse, ObjectType, ) -from utils.triton import get_output, validate_triton_responses +from utils.triton import TritonModelMetadata, get_output, validate_triton_responses router = APIRouter() -def get_first_response_role(conversation, add_generation_prompt, default_role): +# TODO: This behavior should be tested further +def _get_first_response_role( + conversation: List[Dict], add_generation_prompt: bool, default_role: str +) -> str: if add_generation_prompt: return default_role return conversation[-1]["role"] -def streaming_chat_completion_response(request_id, created, model, role, responses): +def _streaming_chat_completion_response( + request_id: str, created: int, model: str, role: str, responses: List +) -> str: # first chunk choice = ChatCompletionStreamingResponseChoice( index=0, @@ -98,26 +104,17 @@ def streaming_chat_completion_response(request_id, created, model, role, respons yield "data: [DONE]\n\n" -@router.post( - "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] -) -def create_chat_completion( - request: CreateChatCompletionRequest, - raw_request: Request, -) -> CreateChatCompletionResponse | StreamingResponse: +def _validate_chat_request( + request: CreateChatCompletionRequest, metadata: TritonModelMetadata +): """ - Creates a model response for the given chat conversation. + Validates a chat completions request to align with currently supported features. 
""" - model_metadatas = raw_request.app.models - if not model_metadatas: - raise HTTPException(status_code=400, detail="No known models") - - metadata = model_metadatas.get(request.model) if not metadata: raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") - if not metadata.request_convert_fn: + if not metadata.request_converter: raise HTTPException( status_code=400, detail=f"Unknown request format for model: {request.model}" ) @@ -128,8 +125,6 @@ def create_chat_completion( if not metadata.backend: raise HTTPException(status_code=400, detail="Unknown backend") - # TODO: Cleanup - triton_model = metadata.model if request.n and request.n > 1: raise HTTPException(status_code=400, detail="Only single choice is supported") @@ -138,33 +133,52 @@ def create_chat_completion( status_code=400, detail="logit bias and log probs not supported" ) + +@router.post( + "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] +) +def create_chat_completion( + request: CreateChatCompletionRequest, + raw_request: Request, +) -> CreateChatCompletionResponse | StreamingResponse: + """ + Creates a model response for the given chat conversation. + """ + + metadata = raw_request.app.models.get(request.model) + _validate_chat_request(request, metadata) + + # TODO: Move conversation/role bits into helper + + # Prepare prompt with chat template + # TODO: Does this need to be exposed to the user? + add_generation_prompt = True conversation = [ {"role": str(message.role), "content": str(message.content)} for message in request.messages ] - # NOTE: This behavior should be tested further - # TODO: Do these need to be exposed to the user? - add_generation_prompt = True - default_role = "assistant" - role = get_first_response_role(conversation, add_generation_prompt, default_role) - prompt = metadata.tokenizer.apply_chat_template( conversation=conversation, tokenize=False, add_generation_prompt=add_generation_prompt, ) - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - + # Convert to Triton request format and perform inference + triton_model = metadata.model responses = triton_model.infer( - metadata.request_convert_fn(triton_model, prompt, request) + metadata.request_converter(triton_model, prompt, request) ) + # Prepare and send responses back to client in OpenAI format + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + default_role = "assistant" + role = _get_first_response_role(conversation, add_generation_prompt, default_role) + if request.stream: return StreamingResponse( - streaming_chat_completion_response( + _streaming_chat_completion_response( request_id, created, request.model, role, responses ), media_type="text/event-stream", diff --git a/python/openai/openai_frontend/routers/completions.py b/python/openai/openai_frontend/routers/completions.py index e58dda0459..3eb0ee07f9 100644 --- a/python/openai/openai_frontend/routers/completions.py +++ b/python/openai/openai_frontend/routers/completions.py @@ -26,6 +26,7 @@ import time import uuid +from typing import List from fastapi import APIRouter, HTTPException, Request from fastapi.responses import StreamingResponse @@ -36,12 +37,14 @@ FinishReason, ObjectType, ) -from utils.triton import get_output, validate_triton_responses +from utils.triton import TritonModelMetadata, get_output, validate_triton_responses router = APIRouter() -def streaming_completion_response(request_id, created, model, responses): +def _streaming_completion_response( + request_id: str, created: int, 
model: str, responses: List +) -> str: for response in responses: text = get_output(response) @@ -64,29 +67,18 @@ def streaming_completion_response(request_id, created, model, responses): yield "data: [DONE]\n\n" -@router.post( - "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] -) -def create_completion( - request: CreateCompletionRequest, raw_request: Request -) -> CreateCompletionResponse | StreamingResponse: +def _validate_completions_request( + request: CreateCompletionRequest, metadata: TritonModelMetadata +): """ - Creates a completion for the provided prompt and parameters. + Validates a completions request to align with currently supported features. """ - - if not request.model: - raise Exception("Request must provide a valid 'model'") - - print(f"[DEBUG] Available model metadata: {raw_request.app.models.keys()=}") - print(f"[DEBUG] Fetching model metadata for {request.model=}") - metadata = raw_request.app.models.get(request.model) - if not metadata: raise HTTPException( status_code=400, detail=f"Unknown model metadata for model: {request.model}" ) - if not metadata.request_convert_fn: + if not metadata.request_converter: raise HTTPException( status_code=400, detail=f"Unknown request format for model: {request.model}" ) @@ -114,16 +106,33 @@ def create_completion( status_code=400, detail="logit bias and log probs not supported" ) - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) +@router.post( + "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] +) +def create_completion( + request: CreateCompletionRequest, raw_request: Request +) -> CreateCompletionResponse | StreamingResponse: + """ + Creates a completion for the provided prompt and parameters. + """ + + # Validate request and convert to Triton format + metadata = raw_request.app.models.get(request.model) + _validate_completions_request(request, metadata) + + # Convert to Triton request format and perform inference triton_model = raw_request.app.server.model(request.model) responses = triton_model.infer( - metadata.request_convert_fn(triton_model, request.prompt, request) + metadata.request_converter(triton_model, request.prompt, request) ) + + # Prepare and send responses back to client in OpenAI format + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) if request.stream: return StreamingResponse( - streaming_completion_response( + _streaming_completion_response( request_id, created, metadata.name, responses ), media_type="text/event-stream", diff --git a/python/openai/openai_frontend/routers/models.py b/python/openai/openai_frontend/routers/models.py index b426dfcc15..07c37549c7 100644 --- a/python/openai/openai_frontend/routers/models.py +++ b/python/openai/openai_frontend/routers/models.py @@ -37,13 +37,13 @@ def list_models(request: Request) -> ListModelsResponse: """ Lists the currently available models, and provides basic information about each one such as the owner and availability. 
""" - model_metadatas = request.app.models - if not model_metadatas: + model_metadata = request.app.models + if not model_metadata: raise HTTPException(status_code=400, detail="No known models") model_list = [] - for model in model_metadatas: - metadata = model_metadatas[model] + for model in model_metadata: + metadata = model_metadata[model] if not metadata: raise HTTPException( status_code=400, detail=f"No metadata for model: {model}" @@ -66,11 +66,11 @@ def retrieve_model(request: Request, model_name: str) -> Model: """ Retrieves a model instance, providing basic information about the model such as the owner and permissioning. """ - model_metadatas = request.app.models - if not model_metadatas: + model_metadata = request.app.models + if not model_metadata: raise HTTPException(status_code=400, detail="No known models") - model = model_metadatas.get(model_name) + model = model_metadata.get(model_name) if not model: raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") diff --git a/python/openai/openai_frontend/utils/triton.py b/python/openai/openai_frontend/utils/triton.py index 34a773b463..44823f0050 100644 --- a/python/openai/openai_frontend/utils/triton.py +++ b/python/openai/openai_frontend/utils/triton.py @@ -28,6 +28,7 @@ import time import typing from dataclasses import dataclass +from typing import Callable, Optional import numpy as np import tritonserver @@ -36,7 +37,7 @@ from utils.tokenizer import get_tokenizer -# TODO: Stricter pydantic validation would be better in future +# TODO: Improve type hints @dataclass class TritonModelMetadata: # Name used in Triton model repository @@ -46,25 +47,22 @@ class TritonModelMetadata: # Triton model object handle model: tritonserver.Model # Tokenizers used for chat templates - tokenizer: typing.Optional[typing.Any] + tokenizer: Optional[typing.Any] # Time that model was loaded by Triton create_time: int # Conversion format between OpenAI and Triton requests - request_convert_fn: typing.Optional[typing.Any] + request_converter: Callable # TODO: Expose explicit flag to catch edge cases -def determine_request_format(backend): +def determine_request_converter(backend): # Request conversion from OpenAI format to backend-specific format if backend == "vllm": - request_convert_fn = create_vllm_inference_request - # Python included to support TRT-LLM BLS model and TRT-LLM python runtime - elif backend in ["tensorrtllm", "python"]: - request_convert_fn = create_trtllm_inference_request - else: - request_convert_fn = None + return create_vllm_inference_request - return request_convert_fn + # Use TRT-LLM format as default for everything else. This could be + # an ensemble, a python or BLS model, a TRT-LLM backend model, etc. 
+ return create_trtllm_inference_request def load_models(server): @@ -107,7 +105,7 @@ def load_models(server): model=model, tokenizer=tokenizer, create_time=create_time, - request_convert_fn=determine_request_format(backend), + request_converter=determine_request_converter(backend), ) model_metadata.append(metadata) diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index 2a1633a71d..d4e94c3991 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -34,7 +34,7 @@ import openai import requests -sys.path.append(os.path.join("..", "openai_frontend")) +sys.path.append(os.path.join(Path(__file__).resolve().parent, "..", "openai_frontend")) from openai_frontend.app import init_app From f84aec4b7b1ca48ba818738891cb9ccfcc4bd914 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 23 Aug 2024 17:19:07 -0700 Subject: [PATCH 59/80] Fix CodeQL import warning --- python/openai/openai_frontend/utils/triton.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/openai/openai_frontend/utils/triton.py b/python/openai/openai_frontend/utils/triton.py index 44823f0050..708a5a316a 100644 --- a/python/openai/openai_frontend/utils/triton.py +++ b/python/openai/openai_frontend/utils/triton.py @@ -26,9 +26,8 @@ import os import time -import typing from dataclasses import dataclass -from typing import Callable, Optional +from typing import Any, Callable, Optional import numpy as np import tritonserver @@ -47,7 +46,7 @@ class TritonModelMetadata: # Triton model object handle model: tritonserver.Model # Tokenizers used for chat templates - tokenizer: Optional[typing.Any] + tokenizer: Optional[Any] # Time that model was loaded by Triton create_time: int # Conversion format between OpenAI and Triton requests From b230697564393d1bd5705de0c7e8310c14d466c4 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 28 Aug 2024 19:14:03 -0700 Subject: [PATCH 60/80] refactor: Use thinner API server with an engine interface (#7570) --- python/openai/README.md | 11 +- python/openai/docker/Dockerfile.tensorrtllm | 2 +- python/openai/docker/Dockerfile.vllm | 9 +- python/openai/docker/requirements_vllm.txt | 3 - python/openai/openai_frontend/app.py | 103 ----- .../openai/openai_frontend/engine/__init__.py | 0 .../openai/openai_frontend/engine/engine.py | 94 ++++ .../openai_frontend/engine/triton_engine.py | 413 ++++++++++++++++++ .../openai_frontend/frontend/__init__.py | 0 .../frontend/fastapi/__init__py | 0 .../fastapi}/routers/__init__.py | 0 .../frontend/fastapi/routers/chat.py | 53 +++ .../frontend/fastapi/routers/completions.py | 52 +++ .../{ => frontend/fastapi}/routers/models.py | 50 +-- .../fastapi}/routers/observability.py | 23 +- .../frontend/fastapi_frontend.py | 107 +++++ .../openai_frontend/frontend/frontend.py | 43 ++ python/openai/openai_frontend/main.py | 63 ++- .../routers/chat_completions.py | 209 --------- .../openai_frontend/routers/completions.py | 160 ------- .../openai/openai_frontend/schemas/openai.py | 18 + python/openai/openai_frontend/utils/triton.py | 157 ++----- python/openai/tests/conftest.py | 6 +- python/openai/tests/test_chat_completions.py | 15 +- python/openai/tests/test_completions.py | 6 + python/openai/tests/test_observability.py | 26 +- python/openai/tests/utils.py | 25 +- qa/L0_openai/test.sh | 8 +- 28 files changed, 940 insertions(+), 716 deletions(-) delete mode 100644 python/openai/docker/requirements_vllm.txt delete mode 100644 python/openai/openai_frontend/app.py create mode 100644 
python/openai/openai_frontend/engine/__init__.py create mode 100644 python/openai/openai_frontend/engine/engine.py create mode 100644 python/openai/openai_frontend/engine/triton_engine.py create mode 100644 python/openai/openai_frontend/frontend/__init__.py create mode 100644 python/openai/openai_frontend/frontend/fastapi/__init__py rename python/openai/openai_frontend/{ => frontend/fastapi}/routers/__init__.py (100%) create mode 100644 python/openai/openai_frontend/frontend/fastapi/routers/chat.py create mode 100644 python/openai/openai_frontend/frontend/fastapi/routers/completions.py rename python/openai/openai_frontend/{ => frontend/fastapi}/routers/models.py (68%) rename python/openai/openai_frontend/{ => frontend/fastapi}/routers/observability.py (74%) create mode 100644 python/openai/openai_frontend/frontend/fastapi_frontend.py create mode 100644 python/openai/openai_frontend/frontend/frontend.py delete mode 100644 python/openai/openai_frontend/routers/chat_completions.py delete mode 100644 python/openai/openai_frontend/routers/completions.py diff --git a/python/openai/README.md b/python/openai/README.md index f6f26664e3..58ec3e7ac5 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -5,7 +5,7 @@ 1. Docker + NVIDIA Container Runtime 2. A correctly configured `HF_TOKEN` for access to HuggingFace models. - The current examples and testing primarily use the - [`meta-llama/Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + [`meta-llama/Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) model, but you can manually bring your own models and adjust accordingly. ## VLLM @@ -26,6 +26,8 @@ docker run -it --net=host --gpus all --rm \ 2. Launch the OpenAI-compatible Triton Inference Server: ```bash +cd openai/ + # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` @@ -53,7 +55,7 @@ curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' 5. Benchmark with `genai-perf`: ```bash MODEL="llama-3.1-8b-instruct" -TOKENIZER="meta-llama/Meta-Llama-3-8B-Instruct" +TOKENIZER="meta-llama/Meta-Llama-3.1-8B-Instruct" genai-perf \ --model ${MODEL} \ --tokenizer ${TOKENIZER} \ @@ -93,8 +95,7 @@ print(completion.choices[0].message.content) 7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): ```bash -cd tests/ -pytest -v +pytest -v tests/ ``` 8. For a list of examples, see the `examples/` folder. @@ -127,6 +128,8 @@ docker run -it --net=host --gpus all --rm \ 2. Launch the OpenAI server: ```bash +cd openai/ + # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` diff --git a/python/openai/docker/Dockerfile.tensorrtllm b/python/openai/docker/Dockerfile.tensorrtllm index 7bdce7499e..ef3bcca69c 100644 --- a/python/openai/docker/Dockerfile.tensorrtllm +++ b/python/openai/docker/Dockerfile.tensorrtllm @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index 8a04d63484..e01ec052dd 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -24,17 +24,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3 +ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3 FROM ${BASE_IMAGE} RUN pip install /opt/tritonserver/python/*.whl # TODO: Update along with other folder/structure changes in review comments WORKDIR /workspace -RUN git clone --single-branch -b rmccormick-openai https://github.com/triton-inference-server/server.git && \ +RUN git clone --single-branch -b rmccormick-openai-interface https://github.com/triton-inference-server/server.git && \ pip install -r server/python/openai/docker/requirements.txt && \ - pip install -r server/python/openai/docker/requirements_vllm.txt && \ - mv server/python/openai/openai_frontend ./openai && \ - mv server/python/openai/examples ./openai && \ - mv server/python/openai/README.md ./openai && \ + mv server/python/openai/ . && \ rm -r server diff --git a/python/openai/docker/requirements_vllm.txt b/python/openai/docker/requirements_vllm.txt deleted file mode 100644 index 32bb38b789..0000000000 --- a/python/openai/docker/requirements_vllm.txt +++ /dev/null @@ -1,3 +0,0 @@ -transformers==4.43.1 -# Llama3.1 vllm requirements -vllm==0.5.3.post1 diff --git a/python/openai/openai_frontend/app.py b/python/openai/openai_frontend/app.py deleted file mode 100644 index ec2aa49f11..0000000000 --- a/python/openai/openai_frontend/app.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from __future__ import annotations - -from contextlib import asynccontextmanager - -import tritonserver -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from routers import chat_completions, completions, models, observability -from utils.triton import init_tritonserver - - -def add_cors_middleware(app: FastAPI): - # Allow API calls through browser /docs route for debug purposes - origins = [ - "http://localhost", - ] - - # TODO: Move towards logger instead of printing - print(f"[WARNING] Adding CORS for the following origins: {origins}") - app.add_middleware( - CORSMiddleware, - allow_origins=origins, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - -@asynccontextmanager -async def lifespan(app: FastAPI): - # Start the tritonserver on FastAPI app startup - print("Starting FastAPI app lifespan...") - server, model_metadatas = init_tritonserver() - - # NOTE: These are meant for read-only access by routes handling requests - # with a single process, and should generally not be modified for the - # lifetime of the application. If multiple uvicorn workers are instantiated, - # then multiple triton servers would be started, one per worker process. - app.server = server - app.models = {metadata.name: metadata for metadata in model_metadatas} - - yield - - # Cleanup the tritonserver on FastAPI app shutdown - print("Shutting down FastAPI app lifespan...") - if app.server: - print("Shutting down Triton Inference Server...") - try: - app.server.stop() - # Log error, but don't raise on shutdown - except tritonserver.InternalError as e: - print(e) - - -def init_app(): - app = FastAPI( - title="OpenAI API", - description="The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details.", - version="2.0.0", - termsOfService="https://openai.com/policies/terms-of-use", - contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, - license={ - "name": "MIT", - "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", - }, - lifespan=lifespan, - ) - - app.include_router(observability.router) - app.include_router(models.router) - app.include_router(completions.router) - app.include_router(chat_completions.router) - - # NOTE: For debugging purposes, should generally be restricted or removed - add_cors_middleware(app) - - return app diff --git a/python/openai/openai_frontend/engine/__init__.py b/python/openai/openai_frontend/engine/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai_frontend/engine/engine.py b/python/openai/openai_frontend/engine/engine.py new file mode 100644 index 0000000000..9c90dec25e --- /dev/null +++ b/python/openai/openai_frontend/engine/engine.py @@ -0,0 +1,94 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from __future__ import annotations + +from typing import Iterator, List, Protocol + +from schemas.openai import ( + CreateChatCompletionRequest, + CreateChatCompletionResponse, + CreateCompletionRequest, + CreateCompletionResponse, + Model, +) + + +class LLMEngine(Protocol): + """ + Interface for an OpenAI-aware inference engine to be attached to an + OpenAI-compatible frontend. + + NOTE: This interface is subject to change, and may land on something more + generic rather than the current 1:1 with OpenAI endpoints over time. + """ + + def ready(self) -> bool: + """ + Returns True if the engine is ready to accept inference requests, or False otherwise. + """ + pass + + def metrics(self) -> str: + """ + Returns the engine's metrics in a Prometheus-compatible string format. + """ + pass + + def models(self) -> List[Model]: + """ + Returns a List of OpenAI Model objects. + """ + pass + + def chat( + self, request: CreateChatCompletionRequest + ) -> CreateChatCompletionResponse | Iterator[str]: + """ + If request.stream is True, this returns an Iterator (or Generator) that + produces server-sent-event (SSE) strings in the following form: + 'data: {CreateChatCompletionStreamResponse}\n\n' + ... + 'data: [DONE]\n\n' + + If request.stream is False, this returns a CreateChatCompletionResponse. + """ + pass + + def completion( + self, request: CreateCompletionRequest + ) -> CreateCompletionResponse | Iterator[str]: + """ + If request.stream is True, this returns an Iterator (or Generator) that + produces server-sent-event (SSE) strings in the following form: + 'data: {CreateCompletionResponse}\n\n' + ... + 'data: [DONE]\n\n' + + If request.stream is False, this returns a CreateCompletionResponse. + """ + pass diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py new file mode 100644 index 0000000000..3081bc71d3 --- /dev/null +++ b/python/openai/openai_frontend/engine/triton_engine.py @@ -0,0 +1,413 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
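
The `LLMEngine` protocol above pins down the streaming contract: when `request.stream` is true, `chat()` and `completion()` return an iterator of server-sent-event strings ending with `data: [DONE]`. A minimal sketch of consuming that contract is below; the `engine` object and model name are assumptions for illustration, not part of this patch, and any object satisfying the protocol would do.

```python
# Hedged sketch: consuming the SSE iterator described by LLMEngine.chat(stream=True).
# `engine` and the model name are illustrative assumptions.
import json

from schemas.openai import CreateChatCompletionRequest


def collect_streamed_text(engine, model_name: str) -> str:
    request = CreateChatCompletionRequest(
        model=model_name,
        messages=[{"role": "user", "content": "Say hello"}],
        stream=True,
    )
    text = ""
    for event in engine.chat(request):  # yields 'data: {...}\n\n' strings
        payload = event[len("data: "):].strip()
        if payload == "[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        text += delta.get("content") or ""
    return text
```
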
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from __future__ import annotations + +import time +import uuid +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterator, List, Optional + +import tritonserver +from engine.engine import LLMEngine +from schemas.openai import ( + ChatCompletionChoice, + ChatCompletionFinishReason, + ChatCompletionResponseMessage, + ChatCompletionStreamingResponseChoice, + ChatCompletionStreamResponseDelta, + Choice, + CreateChatCompletionRequest, + CreateChatCompletionResponse, + CreateChatCompletionStreamResponse, + CreateCompletionRequest, + CreateCompletionResponse, + FinishReason, + Model, + ObjectType, +) +from utils.tokenizer import get_tokenizer +from utils.triton import ( + _create_trtllm_inference_request, + _create_vllm_inference_request, + _get_output, + _validate_triton_responses_non_streaming, +) + + +# TODO: Improve type hints +@dataclass +class TritonModelMetadata: + # Name used in Triton model repository + name: str + # Name of backend used by Triton + backend: str + # Triton model object handle + model: tritonserver.Model + # Tokenizers used for chat templates + tokenizer: Optional[Any] + # Time that model was loaded by Triton + create_time: int + # Conversion format between OpenAI and Triton requests + request_converter: Callable + + +class TritonLLMEngine(LLMEngine): + def __init__(self, server: tritonserver.Server, tokenizer: str): + # Assume an already configured and started server + self.server = server + self.tokenizer = self._get_tokenizer(tokenizer) + + # NOTE: Creation time and model metadata will be static at startup for + # now, and won't account for dynamically loading/unloading models. 
+ self.create_time = int(time.time()) + self.model_metadata = self._get_model_metadata() + + def ready(self) -> bool: + return self.server.ready() + + def metrics(self) -> str: + return self.server.metrics() + + def models(self) -> List[Model]: + models = [] + for metadata in self.model_metadata.values(): + models.append( + Model( + id=metadata.name, + created=metadata.create_time, + object=ObjectType.model, + owned_by="Triton Inference Server", + ), + ) + + return models + + def chat( + self, request: CreateChatCompletionRequest + ) -> CreateChatCompletionResponse | Iterator[str]: + metadata = self.model_metadata.get(request.model) + self._validate_chat_request(request, metadata) + + conversation = [ + {"role": str(message.role), "content": str(message.content)} + for message in request.messages + ] + add_generation_prompt = True + + prompt = metadata.tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=add_generation_prompt, + ) + + # Convert to Triton request format and perform inference + responses = metadata.model.infer( + metadata.request_converter(metadata.model, prompt, request) + ) + + # Prepare and send responses back to client in OpenAI format + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + default_role = "assistant" + role = self._get_first_response_role( + conversation, add_generation_prompt, default_role + ) + + if request.stream: + return self._streaming_chat_iterator( + request_id, created, request.model, role, responses + ) + + # Response validation with decoupled models in mind + responses = list(responses) + _validate_triton_responses_non_streaming(responses) + response = responses[0] + text = _get_output(response) + + return CreateChatCompletionResponse( + id=request_id, + choices=[ + ChatCompletionChoice( + index=0, + message=ChatCompletionResponseMessage( + content=text, role=role, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop, + ) + ], + created=created, + model=request.model, + system_fingerprint=None, + object=ObjectType.chat_completion, + ) + + def completion( + self, request: CreateCompletionRequest + ) -> CreateCompletionResponse | Iterator[str]: + # Validate request and convert to Triton format + metadata = self.model_metadata.get(request.model) + self._validate_completion_request(request, metadata) + + # Convert to Triton request format and perform inference + responses = metadata.model.infer( + metadata.request_converter(metadata.model, request.prompt, request) + ) + + # Prepare and send responses back to client in OpenAI format + request_id = f"cmpl-{uuid.uuid1()}" + created = int(time.time()) + if request.stream: + return self._streaming_completion_iterator( + request_id, created, metadata.name, responses + ) + + # Response validation with decoupled models in mind + responses = list(responses) + _validate_triton_responses_non_streaming(responses) + response = responses[0] + text = _get_output(response) + + choice = Choice( + finish_reason=FinishReason.stop, + index=0, + logprobs=None, + text=text, + ) + return CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=metadata.name, + ) + + # TODO: This behavior should be tested further + def _get_first_response_role( + self, conversation: List[Dict], add_generation_prompt: bool, default_role: str + ) -> str: + if add_generation_prompt: + return default_role + + return conversation[-1]["role"] + + # 
TODO: Expose explicit flag to catch edge cases + def _determine_request_converter(self, backend): + # Request conversion from OpenAI format to backend-specific format + if backend == "vllm": + return _create_vllm_inference_request + + # Use TRT-LLM format as default for everything else. This could be + # an ensemble, a python or BLS model, a TRT-LLM backend model, etc. + return _create_trtllm_inference_request + + def _get_tokenizer(self, tokenizer_name: str): + # TODO: Consider support for custom tokenizers + tokenizer = None + if tokenizer_name: + tokenizer = get_tokenizer(tokenizer_name) + + return tokenizer + + def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]: + # One tokenizer and creation time shared for all loaded models for now. + model_metadata = {} + + # Read all triton models and store the necessary metadata for each + for name, _ in self.server.models().keys(): + model = self.server.model(name) + backend = model.config()["backend"] + print(f"Found model: {name=}, {backend=}") + + metadata = TritonModelMetadata( + name=name, + backend=backend, + model=model, + tokenizer=self.tokenizer, + create_time=self.create_time, + request_converter=self._determine_request_converter(backend), + ) + model_metadata[name] = metadata + + return model_metadata + + def _get_streaming_chat_response_chunk( + self, + choice: ChatCompletionStreamingResponseChoice, + request_id: str, + created: int, + model: str, + ) -> CreateChatCompletionStreamResponse: + return CreateChatCompletionStreamResponse( + id=request_id, + choices=[choice], + created=created, + model=model, + system_fingerprint=None, + object=ObjectType.chat_completion_chunk, + ) + + def _get_first_streaming_chat_response( + self, request_id: str, created: int, model: str, role: str + ) -> CreateChatCompletionStreamResponse: + # First chunk has no content and sets the role + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=role, content="", function_call=None + ), + logprobs=None, + finish_reason=None, + ) + chunk = self._get_streaming_chat_response_chunk( + choice, request_id, created, model + ) + return chunk + + def _get_nth_streaming_chat_response( + self, + request_id: str, + created: int, + model: str, + response: tritonserver.InferenceResponse, + ) -> CreateChatCompletionStreamResponse: + text = _get_output(response) + choice = ChatCompletionStreamingResponseChoice( + index=0, + delta=ChatCompletionStreamResponseDelta( + role=None, content=text, function_call=None + ), + logprobs=None, + finish_reason=ChatCompletionFinishReason.stop if response.final else None, + ) + + chunk = self._get_streaming_chat_response_chunk( + choice, request_id, created, model + ) + return chunk + + def _streaming_chat_iterator( + self, request_id: str, created: int, model: str, role: str, responses: List + ) -> Iterator[str]: + chunk = self._get_first_streaming_chat_response( + request_id, created, model, role + ) + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + for response in responses: + chunk = self._get_nth_streaming_chat_response( + request_id, created, model, response + ) + yield f"data: {chunk.json(exclude_unset=True)}\n\n" + + yield "data: [DONE]\n\n" + + def _validate_chat_request( + self, request: CreateChatCompletionRequest, metadata: TritonModelMetadata + ): + """ + Validates a chat request to align with currently supported features. 
+ """ + + # Reject missing internal information needed to do inference + if not metadata: + raise Exception(f"Unknown model: {request.model}") + + if not metadata.tokenizer: + raise Exception("Unknown tokenizer") + + if not metadata.backend: + raise Exception("Unknown backend") + + if not metadata.request_converter: + raise Exception(f"Unknown request format for model: {request.model}") + + # Reject unsupported features if requested + if request.n and request.n > 1: + raise Exception("Only single choice is supported") + + if request.logit_bias is not None or request.logprobs: + raise Exception("logit bias and log probs not currently supported") + + def _streaming_completion_iterator( + self, request_id: str, created: int, model: str, responses: List + ) -> Iterator[str]: + for response in responses: + text = _get_output(response) + choice = Choice( + finish_reason=FinishReason.stop if response.final else None, + index=0, + logprobs=None, + text=text, + ) + response = CreateCompletionResponse( + id=request_id, + choices=[choice], + system_fingerprint=None, + object=ObjectType.text_completion, + created=created, + model=model, + ) + + yield f"data: {response.json(exclude_unset=True)}\n\n" + + yield "data: [DONE]\n\n" + + def _validate_completion_request( + self, request: CreateCompletionRequest, metadata: TritonModelMetadata + ): + """ + Validates a completions request to align with currently supported features. + """ + # Reject missing internal information needed to do inference + if not metadata: + raise Exception(f"Unknown model: {request.model}") + + if not metadata.backend: + raise Exception("Unknown backend") + + if not metadata.request_converter: + raise Exception(f"Unknown request format for model: {request.model}") + + # Reject unsupported features if requested + if request.suffix is not None: + raise Exception("suffix is not currently supported") + + if not request.prompt: + raise Exception("prompt must be non-empty") + + # Currently only support single string as input + if not isinstance(request.prompt, str): + raise Exception("only single string input is supported") + + if request.n and request.n > 1: + raise Exception("Only single choice is supported") + + if request.logit_bias is not None or request.logprobs is not None: + raise Exception("logit bias and log probs not supported") diff --git a/python/openai/openai_frontend/frontend/__init__.py b/python/openai/openai_frontend/frontend/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai_frontend/frontend/fastapi/__init__py b/python/openai/openai_frontend/frontend/fastapi/__init__py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/openai/openai_frontend/routers/__init__.py b/python/openai/openai_frontend/frontend/fastapi/routers/__init__.py similarity index 100% rename from python/openai/openai_frontend/routers/__init__.py rename to python/openai/openai_frontend/frontend/fastapi/routers/__init__.py diff --git a/python/openai/openai_frontend/frontend/fastapi/routers/chat.py b/python/openai/openai_frontend/frontend/fastapi/routers/chat.py new file mode 100644 index 0000000000..b4985fcc92 --- /dev/null +++ b/python/openai/openai_frontend/frontend/fastapi/routers/chat.py @@ -0,0 +1,53 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
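
The `TritonLLMEngine.chat()` path above applies the tokenizer's chat template, converts the request with the backend-specific converter, runs inference through Triton, and wraps the responses in OpenAI objects. A rough sketch of driving the engine directly, without the HTTP layer, follows; the repository path, model name, and tokenizer are illustrative assumptions.

```python
# Hedged sketch: using TritonLLMEngine directly, outside the FastAPI frontend.
# The model repository path, model name, and tokenizer are assumptions.
import tritonserver
from engine.triton_engine import TritonLLMEngine
from schemas.openai import CreateChatCompletionRequest

server = tritonserver.Server(
    model_repository="tests/vllm_models",
    log_error=True,
).start(wait_until_ready=True)

engine = TritonLLMEngine(
    server=server, tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct"
)

request = CreateChatCompletionRequest(
    model="llama-3.1-8b-instruct",  # must match a model in the repository
    messages=[{"role": "user", "content": "What is Triton?"}],
    max_tokens=64,
)
response = engine.chat(request)  # stream defaults to False
print(response.choices[0].message.content)

server.stop()
```
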
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import StreamingResponse +from schemas.openai import CreateChatCompletionRequest, CreateChatCompletionResponse + +router = APIRouter() + + +@router.post( + "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] +) +def create_chat_completion( + request: CreateChatCompletionRequest, + raw_request: Request, +) -> CreateChatCompletionResponse | StreamingResponse: + """ + Creates a chat completion for the provided messages and parameters. + """ + if not raw_request.app.engine: + raise HTTPException(status_code=500, detail="No attached inference engine") + + try: + response = raw_request.app.engine.chat(request) + if request.stream: + return StreamingResponse(response, media_type="text/event-stream") + return response + except Exception as e: + raise HTTPException(status_code=400, detail=f"{e}") diff --git a/python/openai/openai_frontend/frontend/fastapi/routers/completions.py b/python/openai/openai_frontend/frontend/fastapi/routers/completions.py new file mode 100644 index 0000000000..24f54eb7c8 --- /dev/null +++ b/python/openai/openai_frontend/frontend/fastapi/routers/completions.py @@ -0,0 +1,52 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import StreamingResponse +from schemas.openai import CreateCompletionRequest, CreateCompletionResponse + +router = APIRouter() + + +@router.post( + "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] +) +def create_completion( + request: CreateCompletionRequest, raw_request: Request +) -> CreateCompletionResponse | StreamingResponse: + """ + Creates a completion for the provided prompt and parameters. + """ + if not raw_request.app.engine: + raise HTTPException(status_code=500, detail="No attached inference engine") + + try: + response = raw_request.app.engine.completion(request) + if request.stream: + return StreamingResponse(response, media_type="text/event-stream") + return response + except Exception as e: + raise HTTPException(status_code=400, detail=f"{e}") diff --git a/python/openai/openai_frontend/routers/models.py b/python/openai/openai_frontend/frontend/fastapi/routers/models.py similarity index 68% rename from python/openai/openai_frontend/routers/models.py rename to python/openai/openai_frontend/frontend/fastapi/routers/models.py index 07c37549c7..ac2fa7fdc0 100644 --- a/python/openai/openai_frontend/routers/models.py +++ b/python/openai/openai_frontend/frontend/fastapi/routers/models.py @@ -24,6 +24,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from typing import List + from fastapi import APIRouter, HTTPException, Request from schemas.openai import ListModelsResponse, Model, ObjectType @@ -37,28 +39,11 @@ def list_models(request: Request) -> ListModelsResponse: """ Lists the currently available models, and provides basic information about each one such as the owner and availability. 
""" - model_metadata = request.app.models - if not model_metadata: - raise HTTPException(status_code=400, detail="No known models") - - model_list = [] - for model in model_metadata: - metadata = model_metadata[model] - if not metadata: - raise HTTPException( - status_code=400, detail=f"No metadata for model: {model}" - ) - - model_list.append( - Model( - id=metadata.name, - created=metadata.create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ), - ) + if not request.app.engine: + raise HTTPException(status_code=500, detail="No attached inference engine") - return ListModelsResponse(object=ObjectType.list, data=model_list) + models: List[Model] = request.app.engine.models() + return ListModelsResponse(object=ObjectType.list, data=models) @router.get("/v1/models/{model_name}", response_model=Model, tags=["Models"]) @@ -66,20 +51,13 @@ def retrieve_model(request: Request, model_name: str) -> Model: """ Retrieves a model instance, providing basic information about the model such as the owner and permissioning. """ - model_metadata = request.app.models - if not model_metadata: - raise HTTPException(status_code=400, detail="No known models") - - model = model_metadata.get(model_name) - if not model: - raise HTTPException(status_code=400, detail=f"Unknown model: {model_name}") - - if model_name == model.name: - return Model( - id=model.name, - created=model.create_time, - object=ObjectType.model, - owned_by=OWNED_BY, - ) + if not request.app.engine: + raise HTTPException(status_code=500, detail="No attached inference engine") + + # TODO: Return model directly from engine instead of searching models + models: List[Model] = request.app.engine.models() + for model in models: + if model.id == model_name: + return model raise HTTPException(status_code=404, detail=f"Unknown model: {model_name}") diff --git a/python/openai/openai_frontend/routers/observability.py b/python/openai/openai_frontend/frontend/fastapi/routers/observability.py similarity index 74% rename from python/openai/openai_frontend/routers/observability.py rename to python/openai/openai_frontend/frontend/fastapi/routers/observability.py index 9f18f9934c..b8040f56b7 100644 --- a/python/openai/openai_frontend/routers/observability.py +++ b/python/openai/openai_frontend/frontend/fastapi/routers/observability.py @@ -25,26 +25,25 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import Response +from fastapi.responses import PlainTextResponse, Response router = APIRouter() -@router.get("/metrics", tags=["Utilities"]) -def metrics(request: Request) -> str: - if not request.app.server or not request.app.server.live(): - raise HTTPException( - status_code=400, detail="Triton Inference Server is not live." - ) +@router.get("/metrics", response_class=PlainTextResponse, tags=["Utilities"]) +def metrics(request: Request) -> PlainTextResponse: + return request.app.engine.metrics() - return request.app.server.metrics() +@router.get("/health/ready", tags=["Utilities"]) +def ready(request: Request) -> Response: + if not request.app.engine: + raise HTTPException(status_code=500, detail="No attached inference engine") -@router.get("/health", tags=["Utilities"]) -def health(request: Request) -> Response: - if not request.app.server or not request.app.server.live(): + if not request.app.engine.ready(): raise HTTPException( - status_code=400, detail="Triton Inference Server is not live." 
+ status_code=400, + detail="Attached inference engine is not ready for inference requests.", ) return Response(status_code=200) diff --git a/python/openai/openai_frontend/frontend/fastapi_frontend.py b/python/openai/openai_frontend/frontend/fastapi_frontend.py new file mode 100644 index 0000000000..fd3dc888ed --- /dev/null +++ b/python/openai/openai_frontend/frontend/fastapi_frontend.py @@ -0,0 +1,107 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import uvicorn +from engine.triton_engine import TritonLLMEngine +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from frontend.fastapi.routers import chat, completions, models, observability +from frontend.frontend import OpenAIFrontend + + +class FastApiFrontend(OpenAIFrontend): + def __init__( + self, + engine: TritonLLMEngine, + host: str = "localhost", + port: int = 8000, + log_level: str = "info", + ): + self.host: str = host + self.port: int = port + self.log_level: str = log_level + self.stopped: bool = False + + self.app = self._create_app() + # Attach the inference engine to the FastAPI app + self.app.engine = engine + + def __del__(self): + self.stop() + + def start(self): + uvicorn.run( + self.app, + host=self.host, + port=self.port, + log_level=self.log_level, + timeout_keep_alive=5, + ) + + def stop(self): + # NOTE: If the frontend owned the engine, it could do cleanup here. + pass + + def _create_app(self): + app = FastAPI( + title="OpenAI API", + description="The OpenAI REST API. 
Please see https://platform.openai.com/docs/api-reference for more details.", + version="2.0.0", + termsOfService="https://openai.com/policies/terms-of-use", + contact={"name": "OpenAI Support", "url": "https://help.openai.com/"}, + license={ + "name": "MIT", + "url": "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, + ) + + app.include_router(observability.router) + app.include_router(models.router) + app.include_router(completions.router) + app.include_router(chat.router) + + # NOTE: For debugging purposes, should generally be restricted or removed + self._add_cors_middleware(app) + + return app + + def _add_cors_middleware(self, app: FastAPI): + # Allow API calls through browser /docs route for debug purposes + origins = [ + "http://localhost", + ] + + # TODO: Move towards logger instead of printing + print(f"[WARNING] Adding CORS for the following origins: {origins}") + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) diff --git a/python/openai/openai_frontend/frontend/frontend.py b/python/openai/openai_frontend/frontend/frontend.py new file mode 100644 index 0000000000..2e311a9aac --- /dev/null +++ b/python/openai/openai_frontend/frontend/frontend.py @@ -0,0 +1,43 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +from typing import Protocol + + +class OpenAIFrontend(Protocol): + def start(self) -> None: + """ + Starts the OpenAI-compatible service. + """ + pass + + def stop(self) -> None: + """ + Stops the OpenAI-compatible service. 
+ """ + pass diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index a95d59916f..aafc06624b 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -28,9 +28,22 @@ import argparse import os +import signal +from functools import partial -import uvicorn -from app import init_app +import tritonserver +from engine.triton_engine import TritonLLMEngine +from frontend.fastapi_frontend import FastApiFrontend + + +def signal_handler(server, frontend, signal, frame): + print(f"Received {signal=}, {frame=}") + + # Graceful Shutdown + print("Shutting down OpenAI Frontend...") + frontend.stop() + print("Shutting down Triton Inference Server...") + server.stop() def parse_args(): @@ -52,7 +65,7 @@ def parse_args(): # Triton triton_group = parser.add_argument_group("Triton Inference Server") triton_group.add_argument( - "--tritonserver-log-level", + "--tritonserver-log-verbose-level", type=int, default=0, help="The tritonserver log verbosity level", @@ -73,21 +86,33 @@ def parse_args(): return parser.parse_args() -if __name__ == "__main__": +def main(): args = parse_args() - # NOTE: configurations can be passed to FastAPI app through a builder - # function like init_app in future, but using env vars for simplicity. - os.environ["TRITON_MODEL_REPOSITORY"] = args.model_repository - if args.tokenizer: - os.environ["TOKENIZER"] = args.tokenizer - - os.environ["TRITON_LOG_VERBOSE_LEVEL"] = str(args.tritonserver_log_level) - - app = init_app() - uvicorn.run( - app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=5, + + # Initialize a Triton Inference Server pointing at LLM models + server: tritonserver.Server = tritonserver.Server( + model_repository=args.model_repository, + log_verbose=args.tritonserver_log_verbose_level, + log_info=True, + log_warn=True, + log_error=True, + ).start(wait_until_ready=True) + + # Wrap Triton Inference Server in an interface-conforming "LLMEngine" + engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=args.tokenizer) + + # Attach TritonLLMEngine as the backbone for inference and model management + frontend: FastApiFrontend = FastApiFrontend( + engine=engine, host=args.host, port=args.port, log_level=args.uvicorn_log_level ) + + # Gracefully shutdown when receiving signals for testing and interactive use + signal.signal(signal.SIGINT, partial(signal_handler, server, frontend)) + signal.signal(signal.SIGTERM, partial(signal_handler, server, frontend)) + + # Blocking call until killed or interrupted with SIGINT + frontend.start() + + +if __name__ == "__main__": + main() diff --git a/python/openai/openai_frontend/routers/chat_completions.py b/python/openai/openai_frontend/routers/chat_completions.py deleted file mode 100644 index aa608f72be..0000000000 --- a/python/openai/openai_frontend/routers/chat_completions.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import time -import uuid -from typing import Dict, List - -from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import StreamingResponse -from schemas.openai import ( - ChatCompletionChoice, - ChatCompletionFinishReason, - ChatCompletionResponseMessage, - ChatCompletionStreamingResponseChoice, - ChatCompletionStreamResponseDelta, - CreateChatCompletionRequest, - CreateChatCompletionResponse, - CreateChatCompletionStreamResponse, - ObjectType, -) -from utils.triton import TritonModelMetadata, get_output, validate_triton_responses - -router = APIRouter() - - -# TODO: This behavior should be tested further -def _get_first_response_role( - conversation: List[Dict], add_generation_prompt: bool, default_role: str -) -> str: - if add_generation_prompt: - return default_role - - return conversation[-1]["role"] - - -def _streaming_chat_completion_response( - request_id: str, created: int, model: str, role: str, responses: List -) -> str: - # first chunk - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=role, content="", function_call=None - ), - logprobs=None, - finish_reason=None, - ) - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - for response in responses: - text = get_output(response) - - choice = ChatCompletionStreamingResponseChoice( - index=0, - delta=ChatCompletionStreamResponseDelta( - role=None, content=text, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop if response.final else None, - ) - - chunk = CreateChatCompletionStreamResponse( - id=request_id, - choices=[choice], - created=created, - model=model, - system_fingerprint=None, - object=ObjectType.chat_completion_chunk, - ) - - yield f"data: {chunk.json(exclude_unset=True)}\n\n" - - yield "data: [DONE]\n\n" - - -def _validate_chat_request( - request: CreateChatCompletionRequest, metadata: TritonModelMetadata -): - """ - Validates a chat completions request to align with currently supported features. 
- """ - - if not metadata: - raise HTTPException(status_code=400, detail=f"Unknown model: {request.model}") - - if not metadata.request_converter: - raise HTTPException( - status_code=400, detail=f"Unknown request format for model: {request.model}" - ) - - if not metadata.tokenizer: - raise HTTPException(status_code=400, detail="Unknown tokenizer") - - if not metadata.backend: - raise HTTPException(status_code=400, detail="Unknown backend") - - if request.n and request.n > 1: - raise HTTPException(status_code=400, detail="Only single choice is supported") - - if request.logit_bias is not None or request.logprobs: - raise HTTPException( - status_code=400, detail="logit bias and log probs not supported" - ) - - -@router.post( - "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] -) -def create_chat_completion( - request: CreateChatCompletionRequest, - raw_request: Request, -) -> CreateChatCompletionResponse | StreamingResponse: - """ - Creates a model response for the given chat conversation. - """ - - metadata = raw_request.app.models.get(request.model) - _validate_chat_request(request, metadata) - - # TODO: Move conversation/role bits into helper - - # Prepare prompt with chat template - # TODO: Does this need to be exposed to the user? - add_generation_prompt = True - conversation = [ - {"role": str(message.role), "content": str(message.content)} - for message in request.messages - ] - - prompt = metadata.tokenizer.apply_chat_template( - conversation=conversation, - tokenize=False, - add_generation_prompt=add_generation_prompt, - ) - - # Convert to Triton request format and perform inference - triton_model = metadata.model - responses = triton_model.infer( - metadata.request_converter(triton_model, prompt, request) - ) - - # Prepare and send responses back to client in OpenAI format - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - default_role = "assistant" - role = _get_first_response_role(conversation, add_generation_prompt, default_role) - - if request.stream: - return StreamingResponse( - _streaming_chat_completion_response( - request_id, created, request.model, role, responses - ), - media_type="text/event-stream", - ) - - # Response validation with decoupled models in mind - responses = list(responses) - validate_triton_responses(responses) - response = responses[0] - text = get_output(response) - - return CreateChatCompletionResponse( - id=request_id, - choices=[ - ChatCompletionChoice( - index=0, - message=ChatCompletionResponseMessage( - content=text, role=default_role, function_call=None - ), - logprobs=None, - finish_reason=ChatCompletionFinishReason.stop, - ) - ], - created=created, - model=request.model, - system_fingerprint=None, - object=ObjectType.chat_completion, - ) diff --git a/python/openai/openai_frontend/routers/completions.py b/python/openai/openai_frontend/routers/completions.py deleted file mode 100644 index 3eb0ee07f9..0000000000 --- a/python/openai/openai_frontend/routers/completions.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import time -import uuid -from typing import List - -from fastapi import APIRouter, HTTPException, Request -from fastapi.responses import StreamingResponse -from schemas.openai import ( - Choice, - CreateCompletionRequest, - CreateCompletionResponse, - FinishReason, - ObjectType, -) -from utils.triton import TritonModelMetadata, get_output, validate_triton_responses - -router = APIRouter() - - -def _streaming_completion_response( - request_id: str, created: int, model: str, responses: List -) -> str: - for response in responses: - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop if response.final else None, - index=0, - logprobs=None, - text=text, - ) - response = CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=model, - ) - - yield f"data: {response.json(exclude_unset=True)}\n\n" - yield "data: [DONE]\n\n" - - -def _validate_completions_request( - request: CreateCompletionRequest, metadata: TritonModelMetadata -): - """ - Validates a completions request to align with currently supported features. 
- """ - if not metadata: - raise HTTPException( - status_code=400, detail=f"Unknown model metadata for model: {request.model}" - ) - - if not metadata.request_converter: - raise HTTPException( - status_code=400, detail=f"Unknown request format for model: {request.model}" - ) - - if request.suffix is not None: - raise HTTPException(status_code=400, detail="suffix is not currently supported") - - if request.model != metadata.name: - raise HTTPException(status_code=404, detail=f"Unknown model: {request.model}") - - if not request.prompt: - raise HTTPException(status_code=400, detail="prompt must be non-empty") - - # Currently only support single string as input - if not isinstance(request.prompt, str): - raise HTTPException( - status_code=400, detail="only single string input is supported" - ) - - if request.n and request.n > 1: - raise HTTPException(status_code=400, detail="Only single choice is supported") - - if request.logit_bias is not None or request.logprobs is not None: - raise HTTPException( - status_code=400, detail="logit bias and log probs not supported" - ) - - -@router.post( - "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] -) -def create_completion( - request: CreateCompletionRequest, raw_request: Request -) -> CreateCompletionResponse | StreamingResponse: - """ - Creates a completion for the provided prompt and parameters. - """ - - # Validate request and convert to Triton format - metadata = raw_request.app.models.get(request.model) - _validate_completions_request(request, metadata) - - # Convert to Triton request format and perform inference - triton_model = raw_request.app.server.model(request.model) - responses = triton_model.infer( - metadata.request_converter(triton_model, request.prompt, request) - ) - - # Prepare and send responses back to client in OpenAI format - request_id = f"cmpl-{uuid.uuid1()}" - created = int(time.time()) - if request.stream: - return StreamingResponse( - _streaming_completion_response( - request_id, created, metadata.name, responses - ), - media_type="text/event-stream", - ) - - # Response validation with decoupled models in mind - responses = list(responses) - validate_triton_responses(responses) - response = responses[0] - text = get_output(response) - - choice = Choice( - finish_reason=FinishReason.stop, - index=0, - logprobs=None, - text=text, - ) - return CreateCompletionResponse( - id=request_id, - choices=[choice], - system_fingerprint=None, - object=ObjectType.text_completion, - created=created, - model=metadata.name, - ) diff --git a/python/openai/openai_frontend/schemas/openai.py b/python/openai/openai_frontend/schemas/openai.py index 194ad681c5..b5e4381f7c 100644 --- a/python/openai/openai_frontend/schemas/openai.py +++ b/python/openai/openai_frontend/schemas/openai.py @@ -61,6 +61,10 @@ class CreateCompletionRequest(BaseModel): 0, description="Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", ) + # TODO: Extension, flesh out description and defaults + ignore_eos: Optional[bool] = Field( + False, description="Ignore end-of-sequence tokens during generation\n" + ) logit_bias: Optional[Dict[str, int]] = Field( None, description='Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the GPT tokenizer) to an associated bias value from -100 to 100. You can use this [tokenizer tool](/tokenizer?view=bpe) to convert text to token IDs. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n\nAs an example, you can pass `{"50256": -100}` to prevent the <|endoftext|> token from being generated.\n', @@ -74,6 +78,11 @@ class CreateCompletionRequest(BaseModel): description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", examples=[16], ) + # TODO: Extension, flesh out description and defaults + min_tokens: Optional[conint(ge=0)] = Field( + None, + description="The minimum number of [tokens](/tokenizer) that should be generated in the completion.\n", + ) n: Optional[conint(ge=1, le=128)] = Field( 1, description="How many completions to generate for each prompt.\n\n**Note:** Because this parameter generates many completions, it can quickly consume your token quota. Use carefully and ensure that you have reasonable settings for `max_tokens` and `stop`.\n", @@ -781,6 +790,10 @@ class CreateChatCompletionRequest(BaseModel): 0, description="Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.\n\n[See more information about frequency and presence penalties.](/docs/guides/text-generation/parameter-details)\n", ) + # TODO: Extension, flesh out description and defaults + ignore_eos: Optional[bool] = Field( + False, description="Ignore end-of-sequence tokens during generation\n" + ) logit_bias: Optional[Dict[str, int]] = Field( None, description="Modify the likelihood of specified tokens appearing in the completion.\n\nAccepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.\n", @@ -797,6 +810,11 @@ class CreateChatCompletionRequest(BaseModel): 16, description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. 
[Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", ) + # TODO: Extension, flesh out description and defaults + min_tokens: Optional[conint(ge=0)] = Field( + None, + description="The minimum number of [tokens](/tokenizer) that should be generated in the chat completion.\n", + ) n: Optional[conint(ge=1, le=128)] = Field( 1, description="How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep `n` as `1` to minimize costs.", diff --git a/python/openai/openai_frontend/utils/triton.py b/python/openai/openai_frontend/utils/triton.py index 708a5a316a..e7c97e7c53 100644 --- a/python/openai/openai_frontend/utils/triton.py +++ b/python/openai/openai_frontend/utils/triton.py @@ -24,143 +24,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os -import time -from dataclasses import dataclass -from typing import Any, Callable, Optional +from typing import List import numpy as np import tritonserver -from fastapi import HTTPException from schemas.openai import CreateChatCompletionRequest, CreateCompletionRequest -from utils.tokenizer import get_tokenizer -# TODO: Improve type hints -@dataclass -class TritonModelMetadata: - # Name used in Triton model repository - name: str - # Name of backend used by Triton - backend: str - # Triton model object handle - model: tritonserver.Model - # Tokenizers used for chat templates - tokenizer: Optional[Any] - # Time that model was loaded by Triton - create_time: int - # Conversion format between OpenAI and Triton requests - request_converter: Callable - - -# TODO: Expose explicit flag to catch edge cases -def determine_request_converter(backend): - # Request conversion from OpenAI format to backend-specific format - if backend == "vllm": - return create_vllm_inference_request - - # Use TRT-LLM format as default for everything else. This could be - # an ensemble, a python or BLS model, a TRT-LLM backend model, etc. - return create_trtllm_inference_request - - -def load_models(server): - model_metadata = [] - backends = [] - - # TODO: Consider support for custom tokenizers - tokenizer = None - tokenizer_model = os.environ.get("TOKENIZER") - if tokenizer_model: - print(f"Using env var TOKENIZER={tokenizer_model} to determine the tokenizer") - tokenizer = get_tokenizer(tokenizer_model) - - models = [] - backends = [] - names = [] - # Load all triton models and gather the respective backends of each - for name, version in server.models().keys(): - # Skip models that are already loaded, if any - if version != -1: - continue - - model = server.load(name) - backend = model.config()["backend"] - - names.append(name) - models.append(model) - backends.append(backend) - print(f"Loaded: {name=}, {backend=}, tokenizer={tokenizer_model}") - - create_time = int(time.time()) - - # One tokenizer, convert function, and creation time for all loaded models. - # NOTE: This doesn't currently support having both a vLLM and TRT-LLM - # model loaded at the same time. 
- for name, model, backend in zip(names, models, backends): - metadata = TritonModelMetadata( - name=name, - backend=backend, - model=model, - tokenizer=tokenizer, - create_time=create_time, - request_converter=determine_request_converter(backend), - ) - model_metadata.append(metadata) - - return model_metadata - - -def init_tritonserver(): - model_repository = os.environ.get( - "TRITON_MODEL_REPOSITORY", "/opt/tritonserver/models" - ) - log_verbose_level = int(os.environ.get("TRITON_LOG_VERBOSE_LEVEL", "0")) - - print("Starting Triton Server...") - server = tritonserver.Server( - model_repository=model_repository, - log_verbose=log_verbose_level, - log_info=True, - log_warn=True, - log_error=True, - model_control_mode=tritonserver.ModelControlMode.EXPLICIT, - ).start(wait_until_ready=True) - - print("Loading Models...") - metadatas = load_models(server) - return server, metadatas - - -def get_output(response): - if "text_output" in response.outputs: - try: - return response.outputs["text_output"].to_string_array()[0] - except Exception: - return str(response.outputs["text_output"].to_bytes_array()[0]) - return "" - - -def validate_triton_responses(responses): - num_responses = len(responses) - if num_responses == 1 and responses[0].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses == 2 and responses[-1].final != True: - raise HTTPException( - status_code=400, - detail="Unexpected internal error with incorrect response flags", - ) - if num_responses > 2: - raise HTTPException( - status_code=400, - detail=f"Unexpected number of responses: {num_responses}, expected 1.", - ) - - -def create_vllm_inference_request( +def _create_vllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): inputs = {} @@ -182,7 +53,7 @@ def create_vllm_inference_request( return model.create_request(inputs=inputs, parameters=sampling_parameters) -def create_trtllm_inference_request( +def _create_trtllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): inputs = {} @@ -206,3 +77,25 @@ def create_trtllm_inference_request( if request.temperature is not None: inputs["temperature"] = np.float32([[request.temperature]]) return model.create_request(inputs=inputs) + + +# TODO: Use tritonserver.InferenceResponse when support is published +def _get_output(response: tritonserver._api._response.InferenceResponse): + if "text_output" in response.outputs: + try: + return response.outputs["text_output"].to_string_array()[0] + except Exception: + return str(response.outputs["text_output"].to_bytes_array()[0]) + return "" + + +def _validate_triton_responses_non_streaming( + responses: List[tritonserver._api._response.InferenceResponse], +): + num_responses = len(responses) + if num_responses == 1 and responses[0].final != True: + raise Exception("Unexpected internal error with incorrect response flags") + if num_responses == 2 and responses[-1].final != True: + raise Exception("Unexpected internal error with incorrect response flags") + if num_responses > 2: + raise Exception(f"Unexpected number of responses: {num_responses}, expected 1.") diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index 4952beaef0..dd3755fe83 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -78,10 +78,14 @@ def server(): @pytest.fixture(scope="class") def fastapi_client_class_scope(): model_repository = 
str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, model_repository=model_repository) + app, server = setup_fastapi_app( + tokenizer=TEST_TOKENIZER, model_repository=model_repository + ) with TestClient(app) as test_client: yield test_client + server.stop() + @pytest.fixture def model(): diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index cf0b278c38..dce4efeadc 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -95,6 +95,9 @@ def test_chat_completions_system_prompt_only(self, client, model: str): # logprobs is a boolean for chat completions ("logprobs", True), ("logit_bias", {"0": 0}), + # NOTE: Extensions to the spec + ("min_tokens", 16), + ("ignore_eos", True), ], ) def test_chat_completions_sampling_parameters( @@ -113,7 +116,10 @@ def test_chat_completions_sampling_parameters( unsupported_parameters = ["logprobs", "logit_bias"] if param_key in unsupported_parameters: assert response.status_code == 400 - assert response.json()["detail"] == "logit bias and log probs not supported" + assert ( + response.json()["detail"] + == "logit bias and log probs not currently supported" + ) return assert response.status_code == 200 @@ -131,6 +137,9 @@ def test_chat_completions_sampling_parameters( ("frequency_penalty", -3), ("presence_penalty", 2.1), ("presence_penalty", -2.1), + # NOTE: Extensions to the spec + ("min_tokens", -1), + ("ignore_eos", 123), ], ) def test_chat_completions_invalid_sampling_parameters( @@ -462,7 +471,7 @@ def test_chat_completions_no_tokenizer( self, backend: str, model: str, messages: List[dict] ): model_repository = str(Path(__file__).parent / f"{backend}_models") - app = setup_fastapi_app(model_repository=model_repository, tokenizer="") + app, server = setup_fastapi_app(model_repository=model_repository, tokenizer="") with TestClient(app) as client: response = client.post( "/v1/chat/completions", @@ -470,3 +479,5 @@ def test_chat_completions_no_tokenizer( ) assert response.status_code == 400 assert response.json()["detail"] == "Unknown tokenizer" + + server.stop() diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index 13b7d99686..b6bc38dd58 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -59,6 +59,9 @@ def test_completions_defaults(self, client, model: str, prompt: str): # logprobs is an integer for completions ("logprobs", 5), ("logit_bias", {"0": 0}), + # NOTE: Extensions to the spec + ("min_tokens", 16), + ("ignore_eos", True), ], ) def test_completions_sampling_parameters( @@ -283,6 +286,9 @@ def test_completions_seed(self, client, model: str, prompt: str): ("frequency_penalty", -3), ("presence_penalty", 2.1), ("presence_penalty", -2.1), + # NOTE: Extensions to the spec + ("min_tokens", -1), + ("ignore_eos", 123), ], ) def test_completions_invalid_sampling_parameters( diff --git a/python/openai/tests/test_observability.py b/python/openai/tests/test_observability.py index c2b7ae2e3d..17d25f126d 100644 --- a/python/openai/tests/test_observability.py +++ b/python/openai/tests/test_observability.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
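The invalid-value cases added above (`min_tokens=-1`, `ignore_eos=123`) are caught at the schema layer: because the extensions are plain pydantic fields, FastAPI rejects them with HTTP 422 before the engine is ever called. A minimal, self-contained sketch of that behaviour with an illustrative app (not the project's frontend); the field definitions mirror the ones added to `schemas/openai.py`:

```python
# Minimal sketch (illustrative app and request model, not the project's
# frontend): out-of-range extension values are rejected by FastAPI/pydantic
# request validation with HTTP 422 before any inference happens.
from typing import Optional

from fastapi import FastAPI
from fastapi.testclient import TestClient
from pydantic import BaseModel, Field, conint

app = FastAPI()


class DemoCompletionRequest(BaseModel):
    prompt: str
    # Extensions to the OpenAI spec, mirroring the schema additions above
    min_tokens: Optional[conint(ge=0)] = Field(None)
    ignore_eos: Optional[bool] = Field(False)


@app.post("/v1/completions")
def completions(request: DemoCompletionRequest):
    return {"accepted": True}


client = TestClient(app)
assert client.post("/v1/completions", json={"prompt": "hi", "min_tokens": 16}).status_code == 200
assert client.post("/v1/completions", json={"prompt": "hi", "min_tokens": -1}).status_code == 422
assert client.post("/v1/completions", json={"prompt": "hi", "ignore_eos": 123}).status_code == 422
```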
-import os from pathlib import Path import pytest @@ -41,11 +40,16 @@ def model(): class TestObservability: @pytest.fixture(scope="class") def client(self): + # TODO: Cleanup, mock server/engine, etc. model_repository = Path(__file__).parent / "test_models" - app = setup_fastapi_app(tokenizer="", model_repository=str(model_repository)) + app, server = setup_fastapi_app( + tokenizer="", model_repository=str(model_repository) + ) with TestClient(app) as test_client: yield test_client + server.stop() + ### General Error Handling ### def test_not_found(self, client): response = client.get("/does-not-exist") @@ -53,7 +57,7 @@ def test_not_found(self, client): ### Startup / Health ### def test_startup_success(self, client): - response = client.get("/health") + response = client.get("/health/ready") assert response.status_code == 200 ### Metrics ### @@ -61,8 +65,7 @@ def test_startup_metrics(self, client): response = client.get("/metrics") assert response.status_code == 200 # TODO: Flesh out metrics tests further - # NOTE: response.json() works even on non-json prometheus data - assert "nv_cpu_utilization" in response.json() + assert "nv_cpu_utilization" in response.text ### Models ### def test_models_list(self, client): @@ -86,16 +89,3 @@ def test_models_get(self, client, model): assert model_resp["object"] == "model" assert model_resp["created"] > 0 assert model_resp["owned_by"] == "Triton Inference Server" - - -# For tests that won't use the same pytest fixture for server startup across -# the whole class test suite. -class TestObservabilityCustomFixture: - def test_startup_fail(self): - os.environ["TRITON_MODEL_REPOSITORY"] = "/does/not/exist" - with pytest.raises(Exception): - # Test that FastAPI lifespan startup fails when initializing Triton - # with unknown model repository. - app = init_app() - with TestClient(app): - pass diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index d4e94c3991..5d36ef8ef8 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -33,16 +33,27 @@ import openai import requests +import tritonserver sys.path.append(os.path.join(Path(__file__).resolve().parent, "..", "openai_frontend")) -from openai_frontend.app import init_app +from engine.triton_engine import TritonLLMEngine +from frontend.fastapi_frontend import FastApiFrontend +# TODO: Cleanup, refactor, mock, etc. 
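The metrics test switch from `response.json()` to `response.text` reflects that `/metrics` serves the Prometheus text exposition format, not JSON. A small standalone sketch of pulling one value out of such a payload (the sample lines below are illustrative, not captured server output):

```python
# Sketch: /metrics returns Prometheus text-format lines, so tests assert on
# response.text. Parsing one sample metric by hand (payload is illustrative).
sample_metrics = """\
# HELP nv_cpu_utilization CPU utilization rate
# TYPE nv_cpu_utilization gauge
nv_cpu_utilization 0.25
"""


def find_metric(payload: str, name: str) -> float:
    for line in payload.splitlines():
        if line.startswith(name):  # comment lines start with '#', so they are skipped
            return float(line.rsplit(" ", 1)[-1])
    raise KeyError(name)


assert "nv_cpu_utilization" in sample_metrics
print(find_metric(sample_metrics, "nv_cpu_utilization"))  # -> 0.25
```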
def setup_fastapi_app(tokenizer: str, model_repository: str): - os.environ["TOKENIZER"] = tokenizer - os.environ["TRITON_MODEL_REPOSITORY"] = model_repository - app = init_app() - return app + server: tritonserver.Server = tritonserver.Server( + model_repository=model_repository, + log_verbose=0, + log_info=True, + log_warn=True, + log_error=True, + ).start(wait_until_ready=True) + + engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=tokenizer) + + frontend: FastApiFrontend = FastApiFrontend(engine=engine) + return frontend.app, server # Heavily inspired by vLLM's test infrastructure @@ -72,7 +83,9 @@ def __init__( stderr=sys.stderr, ) # Wait until health endpoint is responsive - self._wait_for_server(url=self.url_for("health"), timeout=self.START_TIMEOUT) + self._wait_for_server( + url=self.url_for("health", "ready"), timeout=self.START_TIMEOUT + ) def __enter__(self): return self diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 8d73397f75..d7f71cf1fc 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -15,12 +15,12 @@ function install_deps() { } function prepare_vllm() { - pip install -r requirements_vllm.txt + echo "No prep needed for vllm currently" } function prepare_tensorrtllm() { MODEL="llama-3-8b-instruct" - MODEL_REPO="../openai/tests/tensorrtllm_models" + MODEL_REPO="../tests/tensorrtllm_models" rm -rf ${MODEL_REPO} # FIXME: This will require an upgrade each release to match the TRT-LLM version @@ -47,12 +47,12 @@ function pre_test() { } function run_test() { - pushd openai/openai/tests + pushd openai/ TEST_LOG="test_openai.log" # Capture error code without exiting to allow log collection set +e - pytest -s -v --junitxml=test_openai.xml 2>&1 > ${TEST_LOG} + pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG} if [ $? -ne 0 ]; then cat ${TEST_LOG} echo -e "\n***\n*** Test Failed\n***" From ea23eeb730b34863dfa3acf17bcd9b7a3b00e374 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 28 Aug 2024 19:28:31 -0700 Subject: [PATCH 61/80] Update dockerfile branch, fix CodeQL error --- python/openai/docker/Dockerfile.vllm | 2 +- python/openai/openai_frontend/main.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm index e01ec052dd..8ee6c04c6c 100644 --- a/python/openai/docker/Dockerfile.vllm +++ b/python/openai/docker/Dockerfile.vllm @@ -31,7 +31,7 @@ RUN pip install /opt/tritonserver/python/*.whl # TODO: Update along with other folder/structure changes in review comments WORKDIR /workspace -RUN git clone --single-branch -b rmccormick-openai-interface https://github.com/triton-inference-server/server.git && \ +RUN git clone --single-branch -b rmccormick-openai https://github.com/triton-inference-server/server.git && \ pip install -r server/python/openai/docker/requirements.txt && \ mv server/python/openai/ . && \ rm -r server diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index aafc06624b..d22e65c901 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
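The reworked test helper above returns both the FastAPI app and the underlying `tritonserver.Server` so callers can stop the server once the `TestClient` context exits. A sketch of the same wiring folded into a context manager so the cleanup cannot be forgotten (the helper name is illustrative; it assumes the `openai_frontend` modules are importable, as `tests/utils.py` arranges via `sys.path`):

```python
# Sketch only: same Server -> TritonLLMEngine -> FastApiFrontend wiring as
# the test utility above, wrapped so server.stop() always runs. Assumes the
# repo's openai_frontend package is on sys.path.
from contextlib import contextmanager

import tritonserver
from engine.triton_engine import TritonLLMEngine
from frontend.fastapi_frontend import FastApiFrontend


@contextmanager
def fastapi_app_for_tests(model_repository: str, tokenizer: str):
    server = tritonserver.Server(
        model_repository=model_repository,
        log_error=True,
    ).start(wait_until_ready=True)
    try:
        engine = TritonLLMEngine(server=server, tokenizer=tokenizer)
        yield FastApiFrontend(engine=engine).app
    finally:
        # Explicit shutdown, mirroring the server.stop() calls in the fixtures
        server.stop()
```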
import argparse -import os import signal from functools import partial From 156535c627d09de42d69de568f34b79f56eb9afa Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 29 Aug 2024 11:45:39 -0700 Subject: [PATCH 62/80] Add tests for custom tokenizers by local file path --- .gitignore | 1 + python/openai/tests/conftest.py | 20 +++-- python/openai/tests/test_chat_completions.py | 84 +++++++++++++++++--- python/openai/tests/test_observability.py | 7 +- python/openai/tests/utils.py | 8 +- 5 files changed, 95 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index 7974ad5fa7..205e95606d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ artifacts # Test exclusions qa/L0_openai/openai tensorrtllm_models +custom_tokenizer diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index dd3755fe83..dfa6b92f65 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -28,7 +28,7 @@ import pytest from fastapi.testclient import TestClient -from tests.utils import OpenAIServer, setup_fastapi_app +from tests.utils import OpenAIServer, setup_fastapi_app, setup_server ### TEST ENVIRONMENT SETUP ### TEST_BACKEND = "" @@ -78,30 +78,34 @@ def server(): @pytest.fixture(scope="class") def fastapi_client_class_scope(): model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - app, server = setup_fastapi_app( - tokenizer=TEST_TOKENIZER, model_repository=model_repository - ) + server = setup_server(model_repository=model_repository) + app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, server=server) with TestClient(app) as test_client: yield test_client server.stop() -@pytest.fixture +@pytest.fixture(scope="module") def model(): return TEST_MODEL -@pytest.fixture +@pytest.fixture(scope="module") def backend(): return TEST_BACKEND -@pytest.fixture +@pytest.fixture(scope="module") +def tokenizer_model(): + return TEST_TOKENIZER + + +@pytest.fixture(scope="module") def prompt(): return TEST_PROMPT -@pytest.fixture +@pytest.fixture(scope="module") def messages(): return TEST_MESSAGES diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index dce4efeadc..d15db4af2e 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -25,12 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import copy +import subprocess from pathlib import Path from typing import List import pytest +import tritonserver from fastapi.testclient import TestClient -from tests.utils import setup_fastapi_app +from tests.utils import setup_fastapi_app, setup_server class TestChatCompletions: @@ -463,21 +465,83 @@ def test_usage_response(self): # For tests that won't use the same pytest fixture for server startup across # the whole class test suite. -class TestChatCompletionsCustomFixture: - # A TOKENIZER must be known for /chat/completions endpoint in order to +class TestChatCompletionsTokenizers: + # Re-use a single Triton server for different frontend configurations + @pytest.fixture(scope="class") + def server(self, backend: str): + model_repository = str(Path(__file__).parent / f"{backend}_models") + server = setup_server(str(model_repository)) + yield server + server.stop() + + # A tokenizer must be known for /chat/completions endpoint in order to # apply chat templates, and for simplicity in determination, users should - # define the TOKENIZER. So, explicitly raise an error if none is provided. 
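The custom-tokenizer test added below downloads the tokenizer JSON files into a local directory and points the frontend at that path instead of a Hub model ID. A small sketch of producing and loading such a directory (uses the public `gpt2` tokenizer as a stand-in and assumes network access to the HuggingFace Hub):

```python
# Sketch (assumes HF Hub network access; "gpt2" is only a public example):
# a local directory containing files such as tokenizer.json and
# tokenizer_config.json can stand in for a Hub model ID.
from transformers import AutoTokenizer

hub_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hub_tokenizer.save_pretrained("./custom_tokenizer")  # writes tokenizer*.json files locally

local_tokenizer = AutoTokenizer.from_pretrained("./custom_tokenizer")
assert local_tokenizer.get_vocab() == hub_tokenizer.get_vocab()
```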
+ # define the tokenizer. So, explicitly raise an error if none is provided. def test_chat_completions_no_tokenizer( - self, backend: str, model: str, messages: List[dict] + self, server: tritonserver.Server, model: str, messages: List[dict] ): - model_repository = str(Path(__file__).parent / f"{backend}_models") - app, server = setup_fastapi_app(model_repository=model_repository, tokenizer="") + app = setup_fastapi_app(tokenizer="", server=server) with TestClient(app) as client: response = client.post( "/v1/chat/completions", json={"model": model, "messages": messages}, ) - assert response.status_code == 400 - assert response.json()["detail"] == "Unknown tokenizer" - server.stop() + assert response.status_code == 400 + assert response.json()["detail"] == "Unknown tokenizer" + + def test_chat_completions_custom_tokenizer( + self, + server: tritonserver.Server, + tokenizer_model: str, + model: str, + messages: List[dict], + ): + # Tokenizers can be provided by a local file path to a directory containing + # the relevant files such as tokenizer.json and tokenizer_config.json. + custom_tokenizer_path = str(Path(__file__).parent / "custom_tokenizer") + download_cmd = f"huggingface-cli download --local-dir {custom_tokenizer_path} {tokenizer_model} --include *.json" + print(f"Running download command: {download_cmd}") + subprocess.run(download_cmd.split(), check=True) + + # Compare the downloaded tokenizer response against remote HF equivalent + # to assert equivalent functionality in responses and chat template. + app_local = setup_fastapi_app(tokenizer=custom_tokenizer_path, server=server) + app_hf = setup_fastapi_app(tokenizer=tokenizer_model, server=server) + + responses = [] + with TestClient(app_local) as client_local, TestClient(app_hf) as client_hf: + payload = {"model": model, "messages": messages, "temperature": 0} + responses.append(client_local.post("/v1/chat/completions", json=payload)) + responses.append(client_hf.post("/v1/chat/completions", json=payload)) + + for response in responses: + assert response.status_code == 200 + message = response.json()["choices"][0]["message"] + assert message["content"].strip() + assert message["role"] == "assistant" + + def equal_dicts(d1, d2, ignore_keys): + d1_filtered = {k: v for k, v in d1.items() if k not in ignore_keys} + d2_filtered = {k: v for k, v in d2.items() if k not in ignore_keys} + return d1_filtered == d2_filtered + + ignore_keys = ["id", "created"] + assert equal_dicts( + responses[0].json(), responses[1].json(), ignore_keys=ignore_keys + ) + + def test_chat_completions_invalid_chat_tokenizer( + self, server: tritonserver.Server, model: str, messages: List[dict] + ): + # Pick a tokenizer with no chat template defined + invalid_chat_tokenizer = "gpt2" + app = setup_fastapi_app(tokenizer=invalid_chat_tokenizer, server=server) + with TestClient(app) as client: + response = client.post( + "/v1/chat/completions", + json={"model": model, "messages": messages}, + ) + + assert response.status_code == 400 + assert "cannot use apply_chat_template()" in response.json()["detail"].lower() diff --git a/python/openai/tests/test_observability.py b/python/openai/tests/test_observability.py index 17d25f126d..7c267bc460 100644 --- a/python/openai/tests/test_observability.py +++ b/python/openai/tests/test_observability.py @@ -28,7 +28,7 @@ import pytest from fastapi.testclient import TestClient -from tests.utils import setup_fastapi_app +from tests.utils import setup_fastapi_app, setup_server # Override conftest.py default model @@ -42,9 +42,8 @@ 
class TestObservability: def client(self): # TODO: Cleanup, mock server/engine, etc. model_repository = Path(__file__).parent / "test_models" - app, server = setup_fastapi_app( - tokenizer="", model_repository=str(model_repository) - ) + server = setup_server(str(model_repository)) + app = setup_fastapi_app(tokenizer="", server=server) with TestClient(app) as test_client: yield test_client diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index 5d36ef8ef8..d41b064db4 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -41,7 +41,7 @@ # TODO: Cleanup, refactor, mock, etc. -def setup_fastapi_app(tokenizer: str, model_repository: str): +def setup_server(model_repository: str): server: tritonserver.Server = tritonserver.Server( model_repository=model_repository, log_verbose=0, @@ -49,11 +49,13 @@ def setup_fastapi_app(tokenizer: str, model_repository: str): log_warn=True, log_error=True, ).start(wait_until_ready=True) + return server - engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=tokenizer) +def setup_fastapi_app(tokenizer: str, server: tritonserver.Server): + engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=tokenizer) frontend: FastApiFrontend = FastApiFrontend(engine=engine) - return frontend.app, server + return frontend.app # Heavily inspired by vLLM's test infrastructure From 9b7dc59435877aaca015d126b3b190017bc62b05 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 30 Aug 2024 17:13:46 -0700 Subject: [PATCH 63/80] Expose --backend request format override to main.py, and expose env var overrides for backend/model/repository/etc. for flexible testing --- python/openai/examples/test_overrides.sh | 6 ++ .../openai_frontend/engine/triton_engine.py | 12 ++- python/openai/openai_frontend/main.py | 13 ++- python/openai/tests/conftest.py | 82 +++++++++++++------ python/openai/tests/test_chat_completions.py | 31 +++++-- python/openai/tests/test_observability.py | 2 +- python/openai/tests/utils.py | 6 +- 7 files changed, 113 insertions(+), 39 deletions(-) create mode 100755 python/openai/examples/test_overrides.sh diff --git a/python/openai/examples/test_overrides.sh b/python/openai/examples/test_overrides.sh new file mode 100755 index 0000000000..ff36ad2ea5 --- /dev/null +++ b/python/openai/examples/test_overrides.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export TEST_BACKEND="vllm" +export TEST_MODEL=llama-3.1-8b-instruct +export TEST_MODEL_REPOSITORY="${PWD}/tests/vllm_models/" +export TEST_TOKENIZER="meta-llama/Meta-Llama-3.1-8B-Instruct" +python3 -m pytest -s -v tests/ diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py index 3081bc71d3..0a81d98d5e 100644 --- a/python/openai/openai_frontend/engine/triton_engine.py +++ b/python/openai/openai_frontend/engine/triton_engine.py @@ -77,10 +77,14 @@ class TritonModelMetadata: class TritonLLMEngine(LLMEngine): - def __init__(self, server: tritonserver.Server, tokenizer: str): + def __init__( + self, server: tritonserver.Server, tokenizer: str, backend: Optional[str] = None + ): # Assume an already configured and started server self.server = server self.tokenizer = self._get_tokenizer(tokenizer) + # TODO: Reconsider name of "backend" vs. something like "request_format" + self.backend = backend # NOTE: Creation time and model metadata will be static at startup for # now, and won't account for dynamically loading/unloading models. 
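The hunk that follows routes each request through a converter chosen per backend, with the new manual `--backend` override taking precedence over whatever backend the loaded model reports. A condensed, standalone sketch of that selection logic (the converter functions are stubbed out here purely for illustration):

```python
# Condensed sketch of the selection in the hunk below: an explicit override
# wins; otherwise vLLM gets its converter and everything else falls back to
# the TRT-LLM request format. Converters are stubs for illustration only.
from typing import Optional


def _create_vllm_inference_request(*args, **kwargs):
    ...


def _create_trtllm_inference_request(*args, **kwargs):
    ...


def determine_request_converter(model_backend: str, override: Optional[str] = None):
    backend = override if override else model_backend
    if backend == "vllm":
        return _create_vllm_inference_request
    # Default for ensembles, BLS/python models, TRT-LLM backend models, etc.
    return _create_trtllm_inference_request


assert determine_request_converter("python", override="vllm") is _create_vllm_inference_request
assert determine_request_converter("python") is _create_trtllm_inference_request
```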
@@ -218,7 +222,11 @@ def _get_first_response_role( return conversation[-1]["role"] # TODO: Expose explicit flag to catch edge cases - def _determine_request_converter(self, backend): + def _determine_request_converter(self, backend: str): + # Allow manual override of backend request format if provided by user + if self.backend: + backend = self.backend + # Request conversion from OpenAI format to backend-specific format if backend == "vllm": return _create_vllm_inference_request diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index d22e65c901..6f94f81ac5 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -79,7 +79,14 @@ def parse_args(): "--tokenizer", type=str, default=None, - help="HuggingFace ID of the Tokenizer to use for chat templates", + help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates", + ) + triton_group.add_argument( + "--backend", + type=str, + default=None, + choices=["vllm", "tensorrtllm"], + help="Manual override of Triton backend request format (inputs/output names) to use for inference", ) return parser.parse_args() @@ -98,7 +105,9 @@ def main(): ).start(wait_until_ready=True) # Wrap Triton Inference Server in an interface-conforming "LLMEngine" - engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=args.tokenizer) + engine: TritonLLMEngine = TritonLLMEngine( + server=server, tokenizer=args.tokenizer, backend=args.backend + ) # Attach TritonLLMEngine as the backbone for inference and model management frontend: FastApiFrontend = FastApiFrontend( diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index dfa6b92f65..43ccd27908 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -24,39 +24,59 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os from pathlib import Path import pytest from fastapi.testclient import TestClient from tests.utils import OpenAIServer, setup_fastapi_app, setup_server + ### TEST ENVIRONMENT SETUP ### -TEST_BACKEND = "" -TEST_MODEL = "" -TEST_PROMPT = "What is machine learning?" -TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] -TEST_TOKENIZER = "meta-llama/Meta-Llama-3.1-8B-Instruct" +def infer_test_environment(): + # Infer the test environment for simplicity in local dev/testing. + try: + import vllm as _ + + backend = "vllm" + model = "llama-3.1-8b-instruct" + return backend, model + except ImportError: + print("No vllm installation found.") + + try: + import tensorrt_llm as _ + + backend = "tensorrtllm" + model = "tensorrt_llm_bls" + return backend, model + except ImportError: + print("No tensorrt_llm installation found.") -# Infer the test environment for simplicity in local dev/testing. 
-try: - import vllm as _ + raise Exception("Unknown test environment") - TEST_BACKEND = "vllm" - TEST_MODEL = "llama-3.1-8b-instruct" -except ImportError: - print("No vllm installation found.") -try: - import tensorrt_llm as _ +def infer_test_model_repository(backend): + model_repository = str(Path(__file__).parent / f"{backend}_models") + return model_repository - TEST_BACKEND = "tensorrtllm" - TEST_MODEL = "tensorrt_llm_bls" -except ImportError: - print("No tensorrt_llm installation found.") + +# TODO: Refactor away from global variables +TEST_MODEL = os.environ.get("TEST_MODEL") +TEST_BACKEND = os.environ.get("TEST_BACKEND") +TEST_MODEL_REPOSITORY = os.environ.get("TEST_MODEL_REPOSITORY") + +TEST_TOKENIZER = os.environ.get( + "TEST_TOKENIZER", "meta-llama/Meta-Llama-3.1-8B-Instruct" +) +TEST_PROMPT = "What is machine learning?" +TEST_MESSAGES = [{"role": "user", "content": TEST_PROMPT}] if not TEST_BACKEND or not TEST_MODEL: - raise Exception("Unknown test environment") -### + TEST_BACKEND, TEST_MODEL = infer_test_environment() + +if not TEST_MODEL_REPOSITORY: + TEST_MODEL_REPOSITORY = infer_test_model_repository(TEST_BACKEND) # NOTE: OpenAI client requires actual server running, and won't work @@ -64,8 +84,14 @@ # only once for all the tests below. @pytest.fixture(scope="module") def server(): - model_repository = Path(__file__).parent / f"{TEST_BACKEND}_models" - args = ["--model-repository", model_repository, "--tokenizer", TEST_TOKENIZER] + args = [ + "--model-repository", + TEST_MODEL_REPOSITORY, + "--tokenizer", + TEST_TOKENIZER, + "--backend", + TEST_BACKEND, + ] with OpenAIServer(args) as openai_server: yield openai_server @@ -77,15 +103,21 @@ def server(): # the "server" when "starting the server" via TestClient. @pytest.fixture(scope="class") def fastapi_client_class_scope(): - model_repository = str(Path(__file__).parent / f"{TEST_BACKEND}_models") - server = setup_server(model_repository=model_repository) - app = setup_fastapi_app(tokenizer=TEST_TOKENIZER, server=server) + server = setup_server(model_repository=TEST_MODEL_REPOSITORY) + app = setup_fastapi_app( + tokenizer=TEST_TOKENIZER, server=server, backend=TEST_BACKEND + ) with TestClient(app) as test_client: yield test_client server.stop() +@pytest.fixture(scope="module") +def model_repository(): + return TEST_MODEL_REPOSITORY + + @pytest.fixture(scope="module") def model(): return TEST_MODEL diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index d15db4af2e..1ad70f88e1 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -468,9 +468,8 @@ def test_usage_response(self): class TestChatCompletionsTokenizers: # Re-use a single Triton server for different frontend configurations @pytest.fixture(scope="class") - def server(self, backend: str): - model_repository = str(Path(__file__).parent / f"{backend}_models") - server = setup_server(str(model_repository)) + def server(self, model_repository: str): + server = setup_server(model_repository) yield server server.stop() @@ -478,9 +477,13 @@ def server(self, backend: str): # apply chat templates, and for simplicity in determination, users should # define the tokenizer. So, explicitly raise an error if none is provided. 
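The `conftest.py` changes above let CI pin the backend, model, and repository through `TEST_*` environment variables (as in `examples/test_overrides.sh`) while keeping package detection as the fallback. A condensed, standalone sketch of that precedence (simplified; the real logic lives in `conftest.py`):

```python
# Standalone sketch of the override-then-infer precedence used by conftest.py:
# TEST_* environment variables win, otherwise the backend/model pair is
# inferred from whichever engine package happens to be installed.
import importlib.util
import os


def resolve_test_environment():
    backend = os.environ.get("TEST_BACKEND")
    model = os.environ.get("TEST_MODEL")
    if backend and model:
        return backend, model
    if importlib.util.find_spec("vllm") is not None:
        return "vllm", "llama-3.1-8b-instruct"
    if importlib.util.find_spec("tensorrt_llm") is not None:
        return "tensorrtllm", "tensorrt_llm_bls"
    raise RuntimeError("Unknown test environment")


# Example: mimic examples/test_overrides.sh by setting the env vars first
os.environ.setdefault("TEST_BACKEND", "vllm")
os.environ.setdefault("TEST_MODEL", "llama-3.1-8b-instruct")
print(resolve_test_environment())
```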
def test_chat_completions_no_tokenizer( - self, server: tritonserver.Server, model: str, messages: List[dict] + self, + server: tritonserver.Server, + backend: str, + model: str, + messages: List[dict], ): - app = setup_fastapi_app(tokenizer="", server=server) + app = setup_fastapi_app(tokenizer="", server=server, backend=backend) with TestClient(app) as client: response = client.post( "/v1/chat/completions", @@ -493,6 +496,7 @@ def test_chat_completions_no_tokenizer( def test_chat_completions_custom_tokenizer( self, server: tritonserver.Server, + backend: str, tokenizer_model: str, model: str, messages: List[dict], @@ -506,8 +510,12 @@ def test_chat_completions_custom_tokenizer( # Compare the downloaded tokenizer response against remote HF equivalent # to assert equivalent functionality in responses and chat template. - app_local = setup_fastapi_app(tokenizer=custom_tokenizer_path, server=server) - app_hf = setup_fastapi_app(tokenizer=tokenizer_model, server=server) + app_local = setup_fastapi_app( + tokenizer=custom_tokenizer_path, server=server, backend=backend + ) + app_hf = setup_fastapi_app( + tokenizer=tokenizer_model, server=server, backend=backend + ) responses = [] with TestClient(app_local) as client_local, TestClient(app_hf) as client_hf: @@ -534,6 +542,15 @@ def equal_dicts(d1, d2, ignore_keys): def test_chat_completions_invalid_chat_tokenizer( self, server: tritonserver.Server, model: str, messages: List[dict] ): + # NOTE: Use of apply_chat_template on a tokenizer that doesn't support it + # is a warning prior to transformers 4.44, and an error afterwards. + # NOTE: Can remove after both TRT-LLM and VLLM containers have this version. + import transformers + + print(f"{transformers.__version__=}") + if transformers.__version__ < "4.44.0": + pytest.xfail() + # Pick a tokenizer with no chat template defined invalid_chat_tokenizer = "gpt2" app = setup_fastapi_app(tokenizer=invalid_chat_tokenizer, server=server) diff --git a/python/openai/tests/test_observability.py b/python/openai/tests/test_observability.py index 7c267bc460..b15b64e735 100644 --- a/python/openai/tests/test_observability.py +++ b/python/openai/tests/test_observability.py @@ -43,7 +43,7 @@ def client(self): # TODO: Cleanup, mock server/engine, etc. 
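The comparison in the custom-tokenizer test above strips volatile fields before asserting that the locally loaded and Hub-loaded tokenizers produce equivalent responses. A tiny usage sketch of that helper with made-up payloads:

```python
# Usage sketch of the equal_dicts() helper from the test above, with made-up
# response payloads: "id" and "created" differ per request, so they are
# ignored when asserting two responses are otherwise identical.
def equal_dicts(d1: dict, d2: dict, ignore_keys) -> bool:
    d1_filtered = {k: v for k, v in d1.items() if k not in ignore_keys}
    d2_filtered = {k: v for k, v in d2.items() if k not in ignore_keys}
    return d1_filtered == d2_filtered


local_response = {"id": "chatcmpl-111", "created": 1727000000, "choices": ["same text"]}
hub_response = {"id": "chatcmpl-222", "created": 1727000042, "choices": ["same text"]}

assert equal_dicts(local_response, hub_response, ignore_keys=["id", "created"])
assert not equal_dicts(local_response, {"choices": ["different"]}, ignore_keys=["id", "created"])
```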
model_repository = Path(__file__).parent / "test_models" server = setup_server(str(model_repository)) - app = setup_fastapi_app(tokenizer="", server=server) + app = setup_fastapi_app(tokenizer="", server=server, backend=None) with TestClient(app) as test_client: yield test_client diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index d41b064db4..3106586d63 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -52,8 +52,10 @@ def setup_server(model_repository: str): return server -def setup_fastapi_app(tokenizer: str, server: tritonserver.Server): - engine: TritonLLMEngine = TritonLLMEngine(server=server, tokenizer=tokenizer) +def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str): + engine: TritonLLMEngine = TritonLLMEngine( + server=server, tokenizer=tokenizer, backend=backend + ) frontend: FastApiFrontend = FastApiFrontend(engine=engine) return frontend.app From a1484e4155951ad308e289388c17f12c17f57774 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 4 Sep 2024 15:57:01 -0700 Subject: [PATCH 64/80] Fix tokenizer test, remove TODO --- python/openai/openai_frontend/engine/triton_engine.py | 1 - python/openai/tests/test_chat_completions.py | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py index 0a81d98d5e..b96c9da0c7 100644 --- a/python/openai/openai_frontend/engine/triton_engine.py +++ b/python/openai/openai_frontend/engine/triton_engine.py @@ -236,7 +236,6 @@ def _determine_request_converter(self, backend: str): return _create_trtllm_inference_request def _get_tokenizer(self, tokenizer_name: str): - # TODO: Consider support for custom tokenizers tokenizer = None if tokenizer_name: tokenizer = get_tokenizer(tokenizer_name) diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 1ad70f88e1..973c719497 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -540,7 +540,11 @@ def equal_dicts(d1, d2, ignore_keys): ) def test_chat_completions_invalid_chat_tokenizer( - self, server: tritonserver.Server, model: str, messages: List[dict] + self, + server: tritonserver.Server, + backend: str, + model: str, + messages: List[dict], ): # NOTE: Use of apply_chat_template on a tokenizer that doesn't support it # is a warning prior to transformers 4.44, and an error afterwards. 
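The transformers-version note above is easy to reproduce in isolation: a tokenizer that defines no `chat_template` cannot apply chat templates, and with transformers >= 4.44 that surfaces as an exception rather than a warning. A sketch, assuming HF Hub network access and a recent transformers install (`gpt2` is just an example of such a tokenizer):

```python
# Sketch (assumes HF Hub access and transformers >= 4.44): applying a chat
# template with a tokenizer that has no chat_template raises, which is the
# behavior the version check in the test above accounts for.
import transformers
from transformers import AutoTokenizer

print(transformers.__version__)
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # defines no chat_template

try:
    tokenizer.apply_chat_template(
        [{"role": "user", "content": "What is machine learning?"}], tokenize=False
    )
except Exception as e:
    print(f"Expected failure without a chat template: {e}")
```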
@@ -553,7 +557,9 @@ def test_chat_completions_invalid_chat_tokenizer( # Pick a tokenizer with no chat template defined invalid_chat_tokenizer = "gpt2" - app = setup_fastapi_app(tokenizer=invalid_chat_tokenizer, server=server) + app = setup_fastapi_app( + tokenizer=invalid_chat_tokenizer, server=server, backend=backend + ) with TestClient(app) as client: response = client.post( "/v1/chat/completions", From 33eee48bba96cc62cb7d8313abeae20a3e44970f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 24 Sep 2024 17:40:17 -0700 Subject: [PATCH 65/80] perf: Improve chat completions performance at high concurrency (#7653) --- .gitignore | 2 + .../openai_frontend/engine/triton_engine.py | 74 +++++++++++-------- .../{ => engine}/utils/__init__.py | 0 .../{ => engine}/utils/tokenizer.py | 0 .../{ => engine}/utils/triton.py | 48 ++++++++++-- .../frontend/fastapi/routers/chat.py | 4 +- .../frontend/fastapi/routers/completions.py | 4 +- .../frontend/fastapi_frontend.py | 6 +- python/openai/tests/test_chat_completions.py | 3 +- python/openai/tests/test_completions.py | 25 ++++++- .../llama-3.1-8b-instruct/1/model.json | 2 +- 11 files changed, 122 insertions(+), 46 deletions(-) rename python/openai/openai_frontend/{ => engine}/utils/__init__.py (100%) rename python/openai/openai_frontend/{ => engine}/utils/tokenizer.py (100%) rename python/openai/openai_frontend/{ => engine}/utils/triton.py (72%) diff --git a/.gitignore b/.gitignore index 205e95606d..bce94b6830 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ tmp *.xml test_results.txt artifacts +cprofile +*.prof # Test exclusions qa/L0_openai/openai diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py index b96c9da0c7..e6f4f0bbe3 100644 --- a/python/openai/openai_frontend/engine/triton_engine.py +++ b/python/openai/openai_frontend/engine/triton_engine.py @@ -30,10 +30,17 @@ import time import uuid from dataclasses import dataclass -from typing import Any, Callable, Dict, Iterator, List, Optional +from typing import Any, AsyncIterable, AsyncIterator, Callable, Dict, List, Optional import tritonserver from engine.engine import LLMEngine +from engine.utils.tokenizer import get_tokenizer +from engine.utils.triton import ( + _create_trtllm_inference_request, + _create_vllm_inference_request, + _get_output, + _validate_triton_responses_non_streaming, +) from schemas.openai import ( ChatCompletionChoice, ChatCompletionFinishReason, @@ -50,13 +57,6 @@ Model, ObjectType, ) -from utils.tokenizer import get_tokenizer -from utils.triton import ( - _create_trtllm_inference_request, - _create_vllm_inference_request, - _get_output, - _validate_triton_responses_non_streaming, -) # TODO: Improve type hints @@ -111,9 +111,9 @@ def models(self) -> List[Model]: return models - def chat( + async def chat( self, request: CreateChatCompletionRequest - ) -> CreateChatCompletionResponse | Iterator[str]: + ) -> CreateChatCompletionResponse | AsyncIterator[str]: metadata = self.model_metadata.get(request.model) self._validate_chat_request(request, metadata) @@ -130,7 +130,7 @@ def chat( ) # Convert to Triton request format and perform inference - responses = metadata.model.infer( + responses = metadata.model.async_infer( metadata.request_converter(metadata.model, prompt, request) ) @@ -148,7 +148,7 @@ def chat( ) # Response validation with decoupled models in mind - responses = list(responses) + responses = [response async for response in responses] 
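With the switch from `model.infer()` to `model.async_infer()` above, responses arrive as an async iterable, so the non-streaming paths gather them with an async comprehension before validating the `final` flags. A standalone sketch of that pattern with a fake response stream (`FakeResponse` and `fake_async_infer` are stand-ins, not tritonserver APIs):

```python
# Standalone sketch of the collection pattern above: the responses object is
# an async iterable, so the non-streaming path gathers everything with an
# async comprehension and checks the final flag. All types here are fakes.
import asyncio
from dataclasses import dataclass


@dataclass
class FakeResponse:
    text_output: str
    final: bool


async def fake_async_infer():
    yield FakeResponse("Machine learning is", final=False)
    yield FakeResponse("", final=True)


async def collect_non_streaming():
    responses = [response async for response in fake_async_infer()]
    assert responses[-1].final  # mirrors _validate_triton_responses_non_streaming
    return "".join(r.text_output for r in responses)


print(asyncio.run(collect_non_streaming()))
```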
_validate_triton_responses_non_streaming(responses) response = responses[0] text = _get_output(response) @@ -171,15 +171,15 @@ def chat( object=ObjectType.chat_completion, ) - def completion( + async def completion( self, request: CreateCompletionRequest - ) -> CreateCompletionResponse | Iterator[str]: + ) -> CreateCompletionResponse | AsyncIterator[str]: # Validate request and convert to Triton format metadata = self.model_metadata.get(request.model) self._validate_completion_request(request, metadata) # Convert to Triton request format and perform inference - responses = metadata.model.infer( + responses = metadata.model.async_infer( metadata.request_converter(metadata.model, request.prompt, request) ) @@ -192,7 +192,7 @@ def completion( ) # Response validation with decoupled models in mind - responses = list(responses) + responses = [response async for response in responses] _validate_triton_responses_non_streaming(responses) response = responses[0] text = _get_output(response) @@ -319,19 +319,24 @@ def _get_nth_streaming_chat_response( ) return chunk - def _streaming_chat_iterator( - self, request_id: str, created: int, model: str, role: str, responses: List - ) -> Iterator[str]: + async def _streaming_chat_iterator( + self, + request_id: str, + created: int, + model: str, + role: str, + responses: AsyncIterable, + ) -> AsyncIterator[str]: chunk = self._get_first_streaming_chat_response( request_id, created, model, role ) - yield f"data: {chunk.json(exclude_unset=True)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" - for response in responses: + async for response in responses: chunk = self._get_nth_streaming_chat_response( request_id, created, model, response ) - yield f"data: {chunk.json(exclude_unset=True)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" yield "data: [DONE]\n\n" @@ -357,15 +362,17 @@ def _validate_chat_request( # Reject unsupported features if requested if request.n and request.n > 1: - raise Exception("Only single choice is supported") + raise Exception( + f"Received n={request.n}, but only single choice (n=1) is currently supported" + ) if request.logit_bias is not None or request.logprobs: raise Exception("logit bias and log probs not currently supported") - def _streaming_completion_iterator( - self, request_id: str, created: int, model: str, responses: List - ) -> Iterator[str]: - for response in responses: + async def _streaming_completion_iterator( + self, request_id: str, created: int, model: str, responses: AsyncIterable + ) -> AsyncIterator[str]: + async for response in responses: text = _get_output(response) choice = Choice( finish_reason=FinishReason.stop if response.final else None, @@ -373,7 +380,7 @@ def _streaming_completion_iterator( logprobs=None, text=text, ) - response = CreateCompletionResponse( + chunk = CreateCompletionResponse( id=request_id, choices=[choice], system_fingerprint=None, @@ -382,7 +389,7 @@ def _streaming_completion_iterator( model=model, ) - yield f"data: {response.json(exclude_unset=True)}\n\n" + yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" yield "data: [DONE]\n\n" @@ -414,7 +421,14 @@ def _validate_completion_request( raise Exception("only single string input is supported") if request.n and request.n > 1: - raise Exception("Only single choice is supported") + raise Exception( + f"Received n={request.n}, but only single choice (n=1) is currently supported" + ) + + if request.best_of and request.best_of > 1: + raise Exception( + f"Received 
best_of={request.best_of}, but only single choice (best_of=1) is currently supported" + ) if request.logit_bias is not None or request.logprobs is not None: raise Exception("logit bias and log probs not supported") diff --git a/python/openai/openai_frontend/utils/__init__.py b/python/openai/openai_frontend/engine/utils/__init__.py similarity index 100% rename from python/openai/openai_frontend/utils/__init__.py rename to python/openai/openai_frontend/engine/utils/__init__.py diff --git a/python/openai/openai_frontend/utils/tokenizer.py b/python/openai/openai_frontend/engine/utils/tokenizer.py similarity index 100% rename from python/openai/openai_frontend/utils/tokenizer.py rename to python/openai/openai_frontend/engine/utils/tokenizer.py diff --git a/python/openai/openai_frontend/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py similarity index 72% rename from python/openai/openai_frontend/utils/triton.py rename to python/openai/openai_frontend/engine/utils/triton.py index e7c97e7c53..540d763252 100644 --- a/python/openai/openai_frontend/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -24,7 +24,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from typing import List +import ctypes +from typing import Iterable, List import numpy as np import tritonserver @@ -79,13 +80,50 @@ def _create_trtllm_inference_request( return model.create_request(inputs=inputs) +def _construct_string_from_pointer(pointer: int, size: int) -> str: + """Constructs a Python string from a C pointer and size.""" + + # Create a ctypes string buffer + string_buffer = ctypes.create_string_buffer(size + 1) # +1 for null terminator + + # Copy the data from the pointer to the buffer + ctypes.memmove(string_buffer, pointer, size) + + # Convert the buffer to a Python string + return string_buffer.value.decode("utf-8") # Adjust encoding if needed + + +def _get_volume(shape: Iterable[int]) -> int: + volume = 1 + for dim in shape: + volume *= dim + + return volume + + # TODO: Use tritonserver.InferenceResponse when support is published def _get_output(response: tritonserver._api._response.InferenceResponse): if "text_output" in response.outputs: - try: - return response.outputs["text_output"].to_string_array()[0] - except Exception: - return str(response.outputs["text_output"].to_bytes_array()[0]) + # Alternative method, creates the same string, but goes through + # deserialization, numpy, and dlpack overhead: + # return response.outputs["text_output"].to_bytes_array()[0].decode("utf-8") + + # The following optimization to read string directly from buffer assumes + # there is only a single string, so enforce it to avoid obscure errors. + tensor = response.outputs["text_output"] + volume = _get_volume(tensor.shape) + if volume != 1: + raise Exception( + f"Expected to find 1 string in the output, found {volume} instead." + ) + if tensor.size < 4: + raise Exception( + f"Expected string buffer to contain its serialized byte size, but found size of {tensor.size}." 
+ ) + + # NOTE: Account for serialized byte string length in first 4 bytes of buffer + return _construct_string_from_pointer(tensor.data_ptr + 4, tensor.size - 4) + return "" diff --git a/python/openai/openai_frontend/frontend/fastapi/routers/chat.py b/python/openai/openai_frontend/frontend/fastapi/routers/chat.py index b4985fcc92..0f72047a5e 100644 --- a/python/openai/openai_frontend/frontend/fastapi/routers/chat.py +++ b/python/openai/openai_frontend/frontend/fastapi/routers/chat.py @@ -34,7 +34,7 @@ @router.post( "/v1/chat/completions", response_model=CreateChatCompletionResponse, tags=["Chat"] ) -def create_chat_completion( +async def create_chat_completion( request: CreateChatCompletionRequest, raw_request: Request, ) -> CreateChatCompletionResponse | StreamingResponse: @@ -45,7 +45,7 @@ def create_chat_completion( raise HTTPException(status_code=500, detail="No attached inference engine") try: - response = raw_request.app.engine.chat(request) + response = await raw_request.app.engine.chat(request) if request.stream: return StreamingResponse(response, media_type="text/event-stream") return response diff --git a/python/openai/openai_frontend/frontend/fastapi/routers/completions.py b/python/openai/openai_frontend/frontend/fastapi/routers/completions.py index 24f54eb7c8..ade89a47cc 100644 --- a/python/openai/openai_frontend/frontend/fastapi/routers/completions.py +++ b/python/openai/openai_frontend/frontend/fastapi/routers/completions.py @@ -34,7 +34,7 @@ @router.post( "/v1/completions", response_model=CreateCompletionResponse, tags=["Completions"] ) -def create_completion( +async def create_completion( request: CreateCompletionRequest, raw_request: Request ) -> CreateCompletionResponse | StreamingResponse: """ @@ -44,7 +44,7 @@ def create_completion( raise HTTPException(status_code=500, detail="No attached inference engine") try: - response = raw_request.app.engine.completion(request) + response = await raw_request.app.engine.completion(request) if request.stream: return StreamingResponse(response, media_type="text/event-stream") return response diff --git a/python/openai/openai_frontend/frontend/fastapi_frontend.py b/python/openai/openai_frontend/frontend/fastapi_frontend.py index fd3dc888ed..adee4cbab3 100644 --- a/python/openai/openai_frontend/frontend/fastapi_frontend.py +++ b/python/openai/openai_frontend/frontend/fastapi_frontend.py @@ -55,13 +55,15 @@ def __del__(self): self.stop() def start(self): - uvicorn.run( - self.app, + config = uvicorn.Config( + app=self.app, host=self.host, port=self.port, log_level=self.log_level, timeout_keep_alive=5, ) + server = uvicorn.Server(config) + server.run() def stop(self): # NOTE: If the frontend owned the engine, it could do cleanup here. 
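The pointer-based read above skips the deserialization, numpy, and dlpack round trip by decoding the output string straight out of the tensor buffer, past the 4-byte serialized length prefix. A self-contained ctypes illustration of the same trick using synthetic data (no tritonserver objects involved):

```python
# Self-contained illustration of the ctypes trick used above: copy `size`
# bytes starting at a raw address into a buffer and decode them, skipping a
# 4-byte length prefix. The payload below is synthetic test data.
import ctypes
import struct

payload = struct.pack("<I", 5) + b"hello"              # [uint32 length][bytes]
backing = ctypes.create_string_buffer(payload, len(payload))
data_ptr = ctypes.addressof(backing)


def construct_string_from_pointer(pointer: int, size: int) -> str:
    buffer = ctypes.create_string_buffer(size + 1)     # +1 for null terminator
    ctypes.memmove(buffer, pointer, size)
    return buffer.value.decode("utf-8")


print(construct_string_from_pointer(data_ptr + 4, len(payload) - 4))  # -> hello
```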
diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 973c719497..2c5d220c75 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -94,6 +94,7 @@ def test_chat_completions_system_prompt_only(self, client, model: str): ("top_p", 0.9), ("frequency_penalty", 0.5), ("presence_penalty", 0.2), + ("n", 1), # logprobs is a boolean for chat completions ("logprobs", True), ("logit_bias", {"0": 0}), @@ -415,7 +416,7 @@ def test_chat_completions_multiple_choices( ) assert response.status_code == 400 - assert response.json()["detail"] == "Only single choice is supported" + assert "only single choice" in response.json()["detail"] @pytest.mark.skip(reason="Not Implemented Yet") def test_chat_completions_streaming(self, client): diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index b6bc38dd58..d89ff4701e 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -56,6 +56,8 @@ def test_completions_defaults(self, client, model: str, prompt: str): ("top_p", 0.9), ("frequency_penalty", 0.5), ("presence_penalty", 0.2), + ("best_of", 1), + ("n", 1), # logprobs is an integer for completions ("logprobs", 5), ("logit_bias", {"0": 0}), @@ -331,13 +333,30 @@ def test_no_prompt(self, client, model: str): # 422 Error returned by schema validation assert response.status_code == 422 - def test_completions_multiple_choices(self, client, model: str, prompt: str): + @pytest.mark.parametrize( + "sampling_parameter_dict", + [ + # Each individual parameter should fail for > 1 for now + {"n": 2}, + {"best_of": 2}, + {"n": 2, "best_of": 2}, + # When individual params > 1 are supported, best_of < n should fail + {"n": 2, "best_of": 1}, + ], + ) + def test_completions_multiple_choices( + self, client, sampling_parameter_dict: dict, model: str, prompt: str + ): response = client.post( - "/v1/completions", json={"model": model, "prompt": prompt, "n": 2} + "/v1/completions", + json={"model": model, "prompt": prompt, **sampling_parameter_dict}, ) + print("Response:", response.json()) + # FIXME: Add support and test for success + # Expected to fail when n or best_of > 1, only single choice supported for now assert response.status_code == 400 - assert response.json()["detail"] == "Only single choice is supported" + assert "only single choice" in response.json()["detail"] @pytest.mark.skip(reason="Not Implemented Yet") def test_lora(self): diff --git a/python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json b/python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json index 00f18b88bd..cb9b14c765 100644 --- a/python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json +++ b/python/openai/tests/vllm_models/llama-3.1-8b-instruct/1/model.json @@ -1 +1 @@ -{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.85} \ No newline at end of file +{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.9} From 0882b6085e2e3fde6a250ca674c6aa98bd0ecf1e Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 25 Sep 2024 14:43:01 -0700 Subject: [PATCH 66/80] review feedback: use _to_string helper function, add some clarifying docs and type hints --- .../openai_frontend/engine/utils/triton.py | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git 
a/python/openai/openai_frontend/engine/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py index 540d763252..4bfa9044a2 100644 --- a/python/openai/openai_frontend/engine/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -101,28 +101,38 @@ def _get_volume(shape: Iterable[int]) -> int: return volume +def _to_string(tensor: tritonserver.Tensor) -> str: + # FIXME: This could be a bit more robust by reading byte size from first + # 4 bytes and then just reading the first string, rather than assuming + # single string, assuming it's of similar performance to do so. + + # The following optimization to read string directly from buffer assumes + # there is only a single string, so enforce it to avoid obscure errors. + volume = _get_volume(tensor.shape) + if volume != 1: + raise Exception( + f"Expected to find 1 string in the output, found {volume} instead." + ) + if tensor.size < 4: + raise Exception( + f"Expected string buffer to contain its serialized byte size, but found size of {tensor.size}." + ) + + # NOTE: +/- 4 accounts for serialized byte string length in first 4 bytes of buffer + return _construct_string_from_pointer(tensor.data_ptr + 4, tensor.size - 4) + + # TODO: Use tritonserver.InferenceResponse when support is published -def _get_output(response: tritonserver._api._response.InferenceResponse): +def _get_output(response: tritonserver._api._response.InferenceResponse) -> str: if "text_output" in response.outputs: + tensor = response.outputs["text_output"] + # Alternative method, creates the same string, but goes through # deserialization, numpy, and dlpack overhead: - # return response.outputs["text_output"].to_bytes_array()[0].decode("utf-8") + # return tensor.to_bytes_array()[0].decode("utf-8") - # The following optimization to read string directly from buffer assumes - # there is only a single string, so enforce it to avoid obscure errors. - tensor = response.outputs["text_output"] - volume = _get_volume(tensor.shape) - if volume != 1: - raise Exception( - f"Expected to find 1 string in the output, found {volume} instead." - ) - if tensor.size < 4: - raise Exception( - f"Expected string buffer to contain its serialized byte size, but found size of {tensor.size}." - ) - - # NOTE: Account for serialized byte string length in first 4 bytes of buffer - return _construct_string_from_pointer(tensor.data_ptr + 4, tensor.size - 4) + # Optimized method + return _to_string(tensor) return "" From f073fbf2105e3d628547aee8de6b5148cbf274be Mon Sep 17 00:00:00 2001 From: KrishnanPrash <140860868+KrishnanPrash@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:54:09 -0700 Subject: [PATCH 67/80] feat: KServe Bindings to start tritonfrontend (#7662) Co-authored-by: Ryan McCormick --- python/openai/README.md | 8 ++--- python/openai/openai_frontend/main.py | 46 ++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/python/openai/README.md b/python/openai/README.md index 58ec3e7ac5..5ef45ab5aa 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -36,7 +36,7 @@ python3 openai_frontend/main.py --model-repository tests/vllm_models/ --tokenize - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. 
```bash MODEL="llama-3.1-8b-instruct" -curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ +curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/json' -d '{ "model": "'${MODEL}'", "messages": [{"role": "user", "content": "Say this is a test!"}] }' | jq @@ -46,7 +46,7 @@ curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/ - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. ```bash MODEL="llama-3.1-8b-instruct" -curl -s http://localhost:8000/v1/completions -H 'Content-Type: application/json' -d '{ +curl -s http://localhost:9000/v1/completions -H 'Content-Type: application/json' -d '{ "model": "'${MODEL}'", "prompt": "Machine learning is" }' | jq @@ -73,7 +73,7 @@ genai-perf \ from openai import OpenAI client = OpenAI( - base_url="http://localhost:8000/v1", + base_url="http://localhost:9000/v1", api_key="EMPTY", ) @@ -138,7 +138,7 @@ python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models/ --t - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. ```bash MODEL="tensorrt_llm_bls" -curl -s http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' -d '{ +curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/json' -d '{ "model": "'${MODEL}'", "messages": [{"role": "user", "content": "Say this is a test!"}] }' | jq diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index 6f94f81ac5..67aeb909bc 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -45,6 +45,28 @@ def signal_handler(server, frontend, signal, frame): server.stop() +def start_kserve_frontends(server, args): + http_service, grpc_service = None, None + try: + from tritonfrontend import KServeGrpc, KServeHttp + + http_options = KServeHttp.Options(port=args.kserve_http_port) + http_service = KServeHttp.Server(server, http_options) + http_service.start() + + grpc_options = KServeGrpc.Options(port=args.kserve_grpc_port) + grpc_service = KServeGrpc.Server(server, grpc_options) + grpc_service.start() + + except ModuleNotFoundError: + print( + "[WARNING] The 'tritonfrontend' package was not found. " + "KServe frontends won't be available through this application without it. " + "Check /opt/tritonserver/python for tritonfrontend*.whl and pip install it if present." + ) + return http_service, grpc_service + + def parse_args(): parser = argparse.ArgumentParser( description="Triton OpenAI Compatible RESTful API server." 
@@ -52,7 +74,9 @@ def parse_args(): # Uvicorn uvicorn_group = parser.add_argument_group("Uvicorn") uvicorn_group.add_argument("--host", type=str, default=None, help="host name") - uvicorn_group.add_argument("--port", type=int, default=8000, help="port number") + uvicorn_group.add_argument( + "--openai-port", type=int, default=9000, help="OpenAI HTTP port (default: 9000)" + ) uvicorn_group.add_argument( "--uvicorn-log-level", type=str, @@ -88,6 +112,19 @@ def parse_args(): choices=["vllm", "tensorrtllm"], help="Manual override of Triton backend request format (inputs/output names) to use for inference", ) + # Should this be wrapped in an if block that only enters if import tritonfrontend worked + triton_group.add_argument( + "--kserve-http-port", + type=int, + default=8000, + help="KServe Predict v2 HTTP port (default: 8000)", + ) + triton_group.add_argument( + "--kserve-grpc-port", + type=int, + default=8001, + help="KServe Predict v2 GRPC port (default: 8001)", + ) return parser.parse_args() @@ -104,6 +141,8 @@ def main(): log_error=True, ).start(wait_until_ready=True) + http_service, grpc_service = start_kserve_frontends(server, args) + # Wrap Triton Inference Server in an interface-conforming "LLMEngine" engine: TritonLLMEngine = TritonLLMEngine( server=server, tokenizer=args.tokenizer, backend=args.backend @@ -121,6 +160,11 @@ def main(): # Blocking call until killed or interrupted with SIGINT frontend.start() + if http_service: + http_service.stop() + if grpc_service: + grpc_service.stop() + if __name__ == "__main__": main() From 2d0f7e67e119b2935564b7fc472e61a7419b4b19 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 27 Sep 2024 12:44:36 -0700 Subject: [PATCH 68/80] chore: Fix argparse typo, cleanup argparse groups, make kserve frontends optional (#7663) --- python/openai/README.md | 24 +++++- python/openai/openai_frontend/main.py | 116 +++++++++++++++++--------- python/openai/tests/conftest.py | 4 + python/openai/tests/utils.py | 3 +- 4 files changed, 103 insertions(+), 44 deletions(-) diff --git a/python/openai/README.md b/python/openai/README.md index 5ef45ab5aa..8a59e23bb0 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -29,7 +29,7 @@ docker run -it --net=host --gpus all --rm \ cd openai/ # NOTE: Adjust the --tokenizer based on the model being used -python3 openai_frontend/main.py --model-repository tests/vllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +python3 openai_frontend/main.py --model-repository tests/vllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` 3. Send a `/v1/chat/completions` request: @@ -131,7 +131,7 @@ docker run -it --net=host --gpus all --rm \ cd openai/ # NOTE: Adjust the --tokenizer based on the model being used -python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models/ --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct +python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` 3. Send a `/v1/chat/completions` request: @@ -146,3 +146,23 @@ curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/ The other examples should be the same as vLLM, except that you should set `MODEL="tensorrt_llm_bls"`, everywhere applicable as seen in the example request above. 
+ +## KServe Frontends + +To support serving requests through both the OpenAI-Compatible and +KServe Predict v2 frontends to the same running Triton Inference Server, +the `tritonfrontend` python bindings are included for optional use in this +application as well. + +You can opt-in to including these additional frontends, assuming `tritonfrontend` +is installed, with `--enable-kserve-frontends` like below: + +``` +python3 openai_frontend/main.py \ + --model-repository tests/vllm_models \ + --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct \ + --enable-kserve-frontends +``` + +See `python3 openai_frontend/main.py --help` for more information on the +available arguments and default values. diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index 67aeb909bc..cbd62790a1 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -35,12 +35,26 @@ from frontend.fastapi_frontend import FastApiFrontend -def signal_handler(server, frontend, signal, frame): +def signal_handler( + server, openai_frontend, kserve_http_frontend, kserve_grpc_frontend, signal, frame +): print(f"Received {signal=}, {frame=}") - # Graceful Shutdown - print("Shutting down OpenAI Frontend...") - frontend.stop() + shutdown(server, openai_frontend, kserve_http_frontend, kserve_grpc_frontend) + + +def shutdown(server, openai_frontend, kserve_http, kserve_grpc): + print("Shutting down Triton OpenAI-Compatible Frontend...") + openai_frontend.stop() + + if kserve_http: + print("Shutting down Triton KServe HTTP Frontend...") + kserve_http.stop() + + if kserve_grpc: + print("Shutting down Triton KServe GRPC Frontend...") + kserve_grpc.stop() + print("Shutting down Triton Inference Server...") server.stop() @@ -50,11 +64,11 @@ def start_kserve_frontends(server, args): try: from tritonfrontend import KServeGrpc, KServeHttp - http_options = KServeHttp.Options(port=args.kserve_http_port) + http_options = KServeHttp.Options(address=args.host, port=args.kserve_http_port) http_service = KServeHttp.Server(server, http_options) http_service.start() - grpc_options = KServeGrpc.Options(port=args.kserve_grpc_port) + grpc_options = KServeGrpc.Options(address=args.host, port=args.kserve_grpc_port) grpc_service = KServeGrpc.Server(server, grpc_options) grpc_service.start() @@ -69,30 +83,11 @@ def start_kserve_frontends(server, args): def parse_args(): parser = argparse.ArgumentParser( - description="Triton OpenAI Compatible RESTful API server." - ) - # Uvicorn - uvicorn_group = parser.add_argument_group("Uvicorn") - uvicorn_group.add_argument("--host", type=str, default=None, help="host name") - uvicorn_group.add_argument( - "--openai-port", type=int, default=9000, help="OpenAI HTTP port (default: 9000)" - ) - uvicorn_group.add_argument( - "--uvicorn-log-level", - type=str, - default="info", - choices=["debug", "info", "warning", "error", "critical", "trace"], - help="log level for uvicorn", + description="Triton Inference Server with OpenAI-Compatible RESTful API server." 
) - # Triton + # Triton Inference Server triton_group = parser.add_argument_group("Triton Inference Server") - triton_group.add_argument( - "--tritonserver-log-verbose-level", - type=int, - default=0, - help="The tritonserver log verbosity level", - ) triton_group.add_argument( "--model-repository", type=str, @@ -112,14 +107,46 @@ def parse_args(): choices=["vllm", "tensorrtllm"], help="Manual override of Triton backend request format (inputs/output names) to use for inference", ) - # Should this be wrapped in an if block that only enters if import tritonfrontend worked triton_group.add_argument( + "--tritonserver-log-verbose-level", + type=int, + default=0, + help="The tritonserver log verbosity level", + ) + triton_group.add_argument( + "--host", + type=str, + default="0.0.0.0", + help="Address/host of frontends (default: '0.0.0.0')", + ) + + # OpenAI-Compatible Frontend (FastAPI) + openai_group = parser.add_argument_group("Triton OpenAI-Compatible Frontend") + openai_group.add_argument( + "--openai-port", type=int, default=9000, help="OpenAI HTTP port (default: 9000)" + ) + openai_group.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=["debug", "info", "warning", "error", "critical", "trace"], + help="log level for uvicorn", + ) + + # KServe Predict v2 Frontend + kserve_group = parser.add_argument_group("Triton KServe Frontend") + kserve_group.add_argument( + "--enable-kserve-frontends", + action="store_true", + help="Enable KServe Predict v2 HTTP/GRPC frontends (disabled by default)", + ) + kserve_group.add_argument( "--kserve-http-port", type=int, default=8000, help="KServe Predict v2 HTTP port (default: 8000)", ) - triton_group.add_argument( + kserve_group.add_argument( "--kserve-grpc-port", type=int, default=8001, @@ -141,29 +168,36 @@ def main(): log_error=True, ).start(wait_until_ready=True) - http_service, grpc_service = start_kserve_frontends(server, args) - # Wrap Triton Inference Server in an interface-conforming "LLMEngine" engine: TritonLLMEngine = TritonLLMEngine( server=server, tokenizer=args.tokenizer, backend=args.backend ) # Attach TritonLLMEngine as the backbone for inference and model management - frontend: FastApiFrontend = FastApiFrontend( - engine=engine, host=args.host, port=args.port, log_level=args.uvicorn_log_level + openai_frontend: FastApiFrontend = FastApiFrontend( + engine=engine, + host=args.host, + port=args.openai_port, + log_level=args.uvicorn_log_level, ) + # Optionally expose Triton KServe HTTP/GRPC Frontends + kserve_http, kserve_grpc = None, None + if args.enable_kserve_frontends: + kserve_http, kserve_grpc = start_kserve_frontends(server, args) + # Gracefully shutdown when receiving signals for testing and interactive use - signal.signal(signal.SIGINT, partial(signal_handler, server, frontend)) - signal.signal(signal.SIGTERM, partial(signal_handler, server, frontend)) + signal.signal( + signal.SIGINT, + partial(signal_handler, server, openai_frontend, kserve_http, kserve_grpc), + ) + signal.signal( + signal.SIGTERM, + partial(signal_handler, server, openai_frontend, kserve_http, kserve_grpc), + ) # Blocking call until killed or interrupted with SIGINT - frontend.start() - - if http_service: - http_service.stop() - if grpc_service: - grpc_service.stop() + openai_frontend.start() if __name__ == "__main__": diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index 43ccd27908..9ea9a5634e 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -92,6 +92,10 @@ def 
server(): "--backend", TEST_BACKEND, ] + # TODO: Incorporate kserve frontend binding smoke tests to catch any + # breakage with default values or slight cli arg variations + extra_args = ["--enable-kserve-frontends"] + args += extra_args with OpenAIServer(args) as openai_server: yield openai_server diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index 3106586d63..fdffcc5ea9 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -71,8 +71,9 @@ def __init__( *, env_dict: Optional[Dict[str, str]] = None, ) -> None: + # TODO: Incorporate caller's cli_args passed to this instance instead self.host = "localhost" - self.port = 8000 + self.port = 9000 env = os.environ.copy() if env_dict is not None: From 78e571d39bb55909880b5e9df6a91a78cdbb835a Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 7 Oct 2024 16:03:59 -0700 Subject: [PATCH 69/80] fix: Support sampling parameters of type List for vLLM backend (stop words) (#7682) --- .../openai_frontend/engine/utils/triton.py | 20 +++++---- .../openai/openai_frontend/schemas/openai.py | 1 + python/openai/tests/test_chat_completions.py | 17 +++++++- python/openai/tests/test_openai_client.py | 41 ++++++++++++++++--- 4 files changed, 64 insertions(+), 15 deletions(-) diff --git a/python/openai/openai_frontend/engine/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py index 4bfa9044a2..e55ace5a45 100644 --- a/python/openai/openai_frontend/engine/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -40,18 +40,22 @@ def _create_vllm_inference_request( # NOTE: The exclude_none is important, as internals may not support # values of NoneType at this time. - sampling_parameters = request.model_dump( + sampling_parameters = request.model_dump_json( exclude=excludes, exclude_none=True, ) - inputs["text_input"] = [prompt] - inputs["stream"] = [request.stream] exclude_input_in_output = True echo = getattr(request, "echo", None) - if echo: + if echo is not None: exclude_input_in_output = not echo - inputs["exclude_input_in_output"] = [exclude_input_in_output] - return model.create_request(inputs=inputs, parameters=sampling_parameters) + + inputs["text_input"] = [prompt] + inputs["stream"] = np.bool_([request.stream]) + inputs["exclude_input_in_output"] = np.bool_([exclude_input_in_output]) + # Pass sampling_parameters as serialized JSON string input to support List + # fields like 'stop' that aren't supported by TRITONSERVER_Parameters yet. + inputs["sampling_parameters"] = [sampling_parameters] + return model.create_request(inputs=inputs) def _create_trtllm_inference_request( @@ -59,7 +63,7 @@ def _create_trtllm_inference_request( ): inputs = {} inputs["text_input"] = [[prompt]] - inputs["stream"] = [[request.stream]] + inputs["stream"] = np.bool_([[request.stream]]) if request.max_tokens: inputs["max_tokens"] = np.int32([[request.max_tokens]]) if request.stop: @@ -77,6 +81,8 @@ def _create_trtllm_inference_request( inputs["random_seed"] = np.uint64([[request.seed]]) if request.temperature is not None: inputs["temperature"] = np.float32([[request.temperature]]) + # NOTE: TRT-LLM doesn't currently support runtime changes of 'echo' and it + # is configured at model load time, so we don't handle it here for now. 
return model.create_request(inputs=inputs) diff --git a/python/openai/openai_frontend/schemas/openai.py b/python/openai/openai_frontend/schemas/openai.py index b5e4381f7c..a95a645518 100644 --- a/python/openai/openai_frontend/schemas/openai.py +++ b/python/openai/openai_frontend/schemas/openai.py @@ -806,6 +806,7 @@ class CreateChatCompletionRequest(BaseModel): None, description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.", ) + # TODO: Consider new max_completion_tokens field in the future: https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_completion_tokens max_tokens: Optional[conint(ge=0)] = Field( 16, description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n", diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 2c5d220c75..347f5939f1 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -95,6 +95,11 @@ def test_chat_completions_system_prompt_only(self, client, model: str): ("frequency_penalty", 0.5), ("presence_penalty", 0.2), ("n", 1), + # Single stop word as a string + ("stop", "."), + # List of stop words + ("stop", []), + ("stop", [".", ","]), # logprobs is a boolean for chat completions ("logprobs", True), ("logit_bias", {"0": 0}), @@ -156,8 +161,9 @@ def test_chat_completions_invalid_sampling_parameters( param_key: param_value, }, ) - print("Response:", response.json()) + + # Assert schema validation error assert response.status_code == 422 # Simple tests to verify max_tokens roughly behaves as expected @@ -568,4 +574,11 @@ def test_chat_completions_invalid_chat_tokenizer( ) assert response.status_code == 400 - assert "cannot use apply_chat_template()" in response.json()["detail"].lower() + # Error may vary based on transformers version + expected_errors = [ + "cannot use apply_chat_template()", + "cannot use chat template", + ] + assert any( + error in response.json()["detail"].lower() for error in expected_errors + ) diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py index f1c566cbf2..6f1b456ab4 100644 --- a/python/openai/tests/test_openai_client.py +++ b/python/openai/tests/test_openai_client.py @@ -176,17 +176,39 @@ async def test_completion_streaming( assert chunk.choices[0].finish_reason == stop_reason assert "".join(chunks) == output + @pytest.mark.parametrize( + "sampling_parameter_dict", + [ + {}, + # Verify that stop words work with streaming outputs + {"stop": "is"}, + {"stop": ["is"]}, + {"stop": ["is", ".", ","]}, + ], + ) @pytest.mark.asyncio async def test_chat_streaming( - self, client: openai.AsyncOpenAI, model: str, messages: List[dict] + self, + client: openai.AsyncOpenAI, + model: str, + messages: List[dict], + sampling_parameter_dict: dict, ): + # Fixed seed and temperature for comparing reproducible responses + seed = 0 + temperature = 0.0 + # Generate enough tokens to easily identify stop words are working. 
+ max_tokens = 64 + # Test single chat completion for comparison chat_completion = await client.chat.completions.create( model=model, messages=messages, - max_tokens=10, - temperature=0.0, + max_tokens=max_tokens, + temperature=temperature, + seed=seed, stream=False, + **sampling_parameter_dict, ) output = chat_completion.choices[0].message.content stop_reason = chat_completion.choices[0].finish_reason @@ -195,9 +217,11 @@ async def test_chat_streaming( stream = await client.chat.completions.create( model=model, messages=messages, - max_tokens=10, - temperature=0.0, + max_tokens=max_tokens, + temperature=temperature, + seed=seed, stream=True, + **sampling_parameter_dict, ) chunks = [] finish_reason_count = 0 @@ -213,7 +237,12 @@ async def test_chat_streaming( # finish reason should only return in last block assert finish_reason_count == 1 assert chunk.choices[0].finish_reason == stop_reason - assert "".join(chunks) == output + + # Assert that streaming actually returned multiple responses + # and that it is equivalent to the non-streamed output + assert len(chunks) > 1 + streamed_output = "".join(chunks) + assert streamed_output == output @pytest.mark.skip(reason="Not Implemented Yet") @pytest.mark.asyncio From 579ad63d3bad92942b054b46a0ad44827469f8b7 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 8 Oct 2024 17:03:32 -0700 Subject: [PATCH 70/80] Review feedback: remove examples/ and docker/ folders, update README to not use dockerfiles --- python/openai/README.md | 31 +++++++---------- python/openai/docker/Dockerfile.tensorrtllm | 33 ------------------ python/openai/docker/Dockerfile.vllm | 37 --------------------- python/openai/docker/requirements.txt | 7 ---- python/openai/examples/chat.sh | 9 ----- python/openai/examples/genai_perf.sh | 12 ------- python/openai/examples/models.sh | 3 -- python/openai/examples/openai_client.py | 28 ---------------- python/openai/examples/streaming_chat.sh | 19 ----------- python/openai/examples/test_overrides.sh | 6 ---- 10 files changed, 12 insertions(+), 173 deletions(-) delete mode 100644 python/openai/docker/Dockerfile.tensorrtllm delete mode 100644 python/openai/docker/Dockerfile.vllm delete mode 100644 python/openai/docker/requirements.txt delete mode 100755 python/openai/examples/chat.sh delete mode 100755 python/openai/examples/genai_perf.sh delete mode 100755 python/openai/examples/models.sh delete mode 100755 python/openai/examples/openai_client.py delete mode 100755 python/openai/examples/streaming_chat.sh delete mode 100755 python/openai/examples/test_overrides.sh diff --git a/python/openai/README.md b/python/openai/README.md index 8a59e23bb0..03b1192e39 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -16,17 +16,16 @@ access gated models, make sure this is set in your local environment if needed. ```bash -docker build -t tritonserver-openai-vllm -f docker/Dockerfile.vllm . - docker run -it --net=host --gpus all --rm \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ - tritonserver-openai-vllm + nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3 ``` 2. Launch the OpenAI-compatible Triton Inference Server: ```bash -cd openai/ +git clone https://github.com/triton-inference-server/server.git +cd server/python/openai/ # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/vllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct @@ -95,40 +94,31 @@ print(completion.choices[0].message.content) 7. 
Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): ```bash +cd server/python/openai/ pytest -v tests/ ``` -8. For a list of examples, see the `examples/` folder. - ## TensorRT-LLM -**NOTE**: The workflow for preparing TRT-LLM engines, model repository, etc. in order to -load and test is not fleshed out in the README here yet. You can try using the Triton CLI -or follow existing TRT-LLM backend examples to prepare a model repository, and point -at the model repository accordingly when following the examples. - -0. Prepare your model repository for a TensorRT-LLM model, build the engine, etc. +0. Prepare your model repository for serving a TensorRT-LLM model: + https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#quick-start 1. Build and launch the container: - - Mounts the openai source files to `/workspace` for simplicity, later on these will be shipped in the container. - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. - Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to access gated models, make sure this is set in your local environment if needed. ```bash -docker build -t tritonserver-openai-tensorrtllm -f docker/Dockerfile.tensorrtllm ./docker - docker run -it --net=host --gpus all --rm \ - -v ${PWD}:/workspace \ -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \ -e HF_TOKEN \ - -w /workspace \ - tritonserver-openai-tensorrtllm + nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 ``` 2. Launch the OpenAI server: ```bash -cd openai/ +git clone https://github.com/triton-inference-server/server.git +cd server/python/openai/ # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct @@ -166,3 +156,6 @@ python3 openai_frontend/main.py \ See `python3 openai_frontend/main.py --help` for more information on the available arguments and default values. + +For more information on the `tritonfrontend` python bindings, see the docs +[here](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/tritonfrontend.md). diff --git a/python/openai/docker/Dockerfile.tensorrtllm b/python/openai/docker/Dockerfile.tensorrtllm deleted file mode 100644 index ef3bcca69c..0000000000 --- a/python/openai/docker/Dockerfile.tensorrtllm +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 -FROM ${BASE_IMAGE} - -RUN pip install /opt/tritonserver/python/*.whl - -COPY requirements.txt /tmp -RUN pip install -r /tmp/requirements.txt diff --git a/python/openai/docker/Dockerfile.vllm b/python/openai/docker/Dockerfile.vllm deleted file mode 100644 index 8ee6c04c6c..0000000000 --- a/python/openai/docker/Dockerfile.vllm +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3 -FROM ${BASE_IMAGE} - -RUN pip install /opt/tritonserver/python/*.whl - -# TODO: Update along with other folder/structure changes in review comments -WORKDIR /workspace -RUN git clone --single-branch -b rmccormick-openai https://github.com/triton-inference-server/server.git && \ - pip install -r server/python/openai/docker/requirements.txt && \ - mv server/python/openai/ . 
&& \ - rm -r server diff --git a/python/openai/docker/requirements.txt b/python/openai/docker/requirements.txt deleted file mode 100644 index 24849af73a..0000000000 --- a/python/openai/docker/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -# FastAPI Application -fastapi==0.111.1 -openai==1.40.6 - -# Testing -pytest==8.1.1 -pytest-asyncio==0.23.8 diff --git a/python/openai/examples/chat.sh b/python/openai/examples/chat.sh deleted file mode 100755 index 5a7bb9b656..0000000000 --- a/python/openai/examples/chat.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# or "tensorrt_llm_bls" for TRT-LLM -MODEL=${1:-"llama-3.1-8b-instruct"} -curl -s http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL}'", - "messages": [{"role": "user", "content": "Say this is a test!"}] - }' | jq diff --git a/python/openai/examples/genai_perf.sh b/python/openai/examples/genai_perf.sh deleted file mode 100755 index b1e3716fb6..0000000000 --- a/python/openai/examples/genai_perf.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -MODEL=${1:-"llama-3.1-8b-instruct"} -genai-perf \ - --model ${MODEL} \ - --tokenizer meta-llama/Meta-Llama-3-8B-Instruct \ - --service-kind openai \ - --endpoint-type chat \ - --synthetic-input-tokens-mean 256 \ - --synthetic-input-tokens-stddev 0 \ - --output-tokens-mean 256 \ - --output-tokens-stddev 0 \ - --streaming diff --git a/python/openai/examples/models.sh b/python/openai/examples/models.sh deleted file mode 100755 index 944fbe07af..0000000000 --- a/python/openai/examples/models.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -curl -s http://localhost:8000/v1/models \ - -H "Content-Type: application/json" | jq diff --git a/python/openai/examples/openai_client.py b/python/openai/examples/openai_client.py deleted file mode 100755 index 913aac44a0..0000000000 --- a/python/openai/examples/openai_client.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python3 -import sys - -from openai import OpenAI - -# or "tensorrt_llm_bls" for TRT-LLM -model = "llama-3.1-8b-instruct" -if len(sys.argv) > 1: - model = sys.argv[1] - -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="EMPTY", -) - -completion = client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": "You are a helpful assistant.", - }, - {"role": "user", "content": "What are LLMs?"}, - ], - max_tokens=256, -) - -print(completion.choices[0].message.content) diff --git a/python/openai/examples/streaming_chat.sh b/python/openai/examples/streaming_chat.sh deleted file mode 100755 index 6ace5434d2..0000000000 --- a/python/openai/examples/streaming_chat.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# or "tensorrt_llm_bls" for TRT-LLM -MODEL=${1:-"llama-3.1-8b-instruct"} -curl -s -N http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "'${MODEL}'", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Hello!" 
- } - ], - "stream": true - }' diff --git a/python/openai/examples/test_overrides.sh b/python/openai/examples/test_overrides.sh deleted file mode 100755 index ff36ad2ea5..0000000000 --- a/python/openai/examples/test_overrides.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -export TEST_BACKEND="vllm" -export TEST_MODEL=llama-3.1-8b-instruct -export TEST_MODEL_REPOSITORY="${PWD}/tests/vllm_models/" -export TEST_TOKENIZER="meta-llama/Meta-Llama-3.1-8B-Instruct" -python3 -m pytest -s -v tests/ From 815eebeae651056ccb6be1d49ed852a8496858a4 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 8 Oct 2024 17:20:11 -0700 Subject: [PATCH 71/80] Add a few FIXMEs for follow-up --- python/openai/openai_frontend/engine/utils/triton.py | 2 +- python/openai/openai_frontend/main.py | 1 + qa/L0_openai/test.sh | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/openai/openai_frontend/engine/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py index e55ace5a45..77faa9a958 100644 --- a/python/openai/openai_frontend/engine/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -81,7 +81,7 @@ def _create_trtllm_inference_request( inputs["random_seed"] = np.uint64([[request.seed]]) if request.temperature is not None: inputs["temperature"] = np.float32([[request.temperature]]) - # NOTE: TRT-LLM doesn't currently support runtime changes of 'echo' and it + # FIXME: TRT-LLM doesn't currently support runtime changes of 'echo' and it # is configured at model load time, so we don't handle it here for now. return model.create_request(inputs=inputs) diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py index cbd62790a1..29fc684354 100755 --- a/python/openai/openai_frontend/main.py +++ b/python/openai/openai_frontend/main.py @@ -73,6 +73,7 @@ def start_kserve_frontends(server, args): grpc_service.start() except ModuleNotFoundError: + # FIXME: Raise error instead of warning if kserve frontends are opt-in print( "[WARNING] The 'tritonfrontend' package was not found. " "KServe frontends won't be available through this application without it. " diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index d7f71cf1fc..0a4a8fd837 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -3,6 +3,9 @@ ### Helpers ### function install_deps() { + # FIXME: Once the test script and code is in-place, we can clone/copy the + # tests rather than expecting them to be baked into the container here, for + # better portability and easier test setup. pushd openai/docker pip install /opt/tritonserver/python/triton*.whl pip install -r requirements.txt From 8f92734a3d2c44cd5a789a8e715a1090ff6bea31 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 8 Oct 2024 17:41:28 -0700 Subject: [PATCH 72/80] Add requirements.txt back in, fix test and docs accordingly --- python/openai/README.md | 34 +++++++++++++++++++++++++--------- python/openai/requirements.txt | 7 +++++++ qa/L0_openai/test.sh | 9 +++++---- 3 files changed, 37 insertions(+), 13 deletions(-) create mode 100644 python/openai/requirements.txt diff --git a/python/openai/README.md b/python/openai/README.md index 03b1192e39..61ffc3f3dc 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -10,7 +10,7 @@ ## VLLM -1. Build and launch the container: +1. Launch the container and install dependencies: - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. 
- Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to access gated models, make sure this is set in your local environment if needed. @@ -22,16 +22,24 @@ docker run -it --net=host --gpus all --rm \ nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3 ``` -2. Launch the OpenAI-compatible Triton Inference Server: +2. Install dependencies inside the container: ```bash +# Install python bindings for tritonserver and tritonfrontend +pip install /opt/tritonserver/python/triton*.whl + +# Install application/testing requirements git clone https://github.com/triton-inference-server/server.git cd server/python/openai/ +pip install -r requirements.txt +``` +3. Launch the OpenAI-compatible Triton Inference Server: +```bash # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/vllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` -3. Send a `/v1/chat/completions` request: +4. Send a `/v1/chat/completions` request: - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. ```bash MODEL="llama-3.1-8b-instruct" @@ -41,7 +49,7 @@ curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/ }' | jq ``` -4. Send a `/v1/completions` request: +5. Send a `/v1/completions` request: - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses. ```bash MODEL="llama-3.1-8b-instruct" @@ -51,7 +59,7 @@ curl -s http://localhost:9000/v1/completions -H 'Content-Type: application/json' }' | jq ``` -5. Benchmark with `genai-perf`: +6. Benchmark with `genai-perf`: ```bash MODEL="llama-3.1-8b-instruct" TOKENIZER="meta-llama/Meta-Llama-3.1-8B-Instruct" @@ -67,7 +75,7 @@ genai-perf \ --streaming ``` -6. Use the OpenAI python client directly: +7. Use the OpenAI python client directly: ```python from openai import OpenAI @@ -92,7 +100,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): +8. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): ```bash cd server/python/openai/ pytest -v tests/ @@ -103,7 +111,7 @@ pytest -v tests/ 0. Prepare your model repository for serving a TensorRT-LLM model: https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#quick-start -1. Build and launch the container: +1. Launch the container: - Mounts the `~/.huggingface/cache` for re-use of downloaded models across runs, containers, etc. - Sets the [`HF_TOKEN`](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hftoken) environment variable to access gated models, make sure this is set in your local environment if needed. @@ -115,11 +123,19 @@ docker run -it --net=host --gpus all --rm \ nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 ``` -2. Launch the OpenAI server: +2. Install dependencies inside the container: ```bash +# Install python bindings for tritonserver and tritonfrontend +pip install /opt/tritonserver/python/triton*.whl + +# Install application/testing requirements git clone https://github.com/triton-inference-server/server.git cd server/python/openai/ +pip install -r requirements.txt +``` +2. 
Launch the OpenAI server: +```bash # NOTE: Adjust the --tokenizer based on the model being used python3 openai_frontend/main.py --model-repository tests/tensorrtllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct ``` diff --git a/python/openai/requirements.txt b/python/openai/requirements.txt new file mode 100644 index 0000000000..24849af73a --- /dev/null +++ b/python/openai/requirements.txt @@ -0,0 +1,7 @@ +# FastAPI Application +fastapi==0.111.1 +openai==1.40.6 + +# Testing +pytest==8.1.1 +pytest-asyncio==0.23.8 diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 0a4a8fd837..5d08bd0a57 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -3,12 +3,13 @@ ### Helpers ### function install_deps() { - # FIXME: Once the test script and code is in-place, we can clone/copy the - # tests rather than expecting them to be baked into the container here, for - # better portability and easier test setup. - pushd openai/docker + # Install python bindings for tritonserver and tritonfrontend pip install /opt/tritonserver/python/triton*.whl + + # Install application/testing requirements + pushd openai/ pip install -r requirements.txt + if [ "${IMAGE_KIND}" == "TRTLLM" ]; then prepare_tensorrtllm else From 5c0b2e63d25bc6bed07cfda0b39adc24984c7360 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Tue, 8 Oct 2024 17:49:13 -0700 Subject: [PATCH 73/80] Fix TRT-LLM model repo test path --- qa/L0_openai/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 5d08bd0a57..506bd352bb 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -24,7 +24,7 @@ function prepare_vllm() { function prepare_tensorrtllm() { MODEL="llama-3-8b-instruct" - MODEL_REPO="../tests/tensorrtllm_models" + MODEL_REPO="tests/tensorrtllm_models" rm -rf ${MODEL_REPO} # FIXME: This will require an upgrade each release to match the TRT-LLM version From 44b22827ee794c50709050ce9e257229b52caa7f Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Wed, 9 Oct 2024 16:28:07 -0700 Subject: [PATCH 74/80] Explicitly return error on unknown fields not defined in schema, exclude more openai fields explicitly from sampling parameters input to vllm backend --- .../openai_frontend/engine/utils/triton.py | 22 ++++++++++++++++++- .../openai/openai_frontend/schemas/openai.py | 6 +++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/python/openai/openai_frontend/engine/utils/triton.py b/python/openai/openai_frontend/engine/utils/triton.py index 77faa9a958..2ec8cce7d5 100644 --- a/python/openai/openai_frontend/engine/utils/triton.py +++ b/python/openai/openai_frontend/engine/utils/triton.py @@ -36,7 +36,26 @@ def _create_vllm_inference_request( model, prompt, request: CreateChatCompletionRequest | CreateCompletionRequest ): inputs = {} - excludes = {"model", "stream", "messages", "prompt", "echo"} + # Exclude non-sampling parameters so they aren't passed to vLLM + excludes = { + "model", + "stream", + "messages", + "prompt", + "echo", + "store", + "metadata", + "response_format", + "service_tier", + "stream_options", + "tools", + "tool_choice", + "parallel_tool_calls", + "user", + "function_call", + "functions", + "suffix", + } # NOTE: The exclude_none is important, as internals may not support # values of NoneType at this time. 
@@ -44,6 +63,7 @@ def _create_vllm_inference_request( exclude=excludes, exclude_none=True, ) + exclude_input_in_output = True echo = getattr(request, "echo", None) if echo is not None: diff --git a/python/openai/openai_frontend/schemas/openai.py b/python/openai/openai_frontend/schemas/openai.py index a95a645518..a012f277f1 100644 --- a/python/openai/openai_frontend/schemas/openai.py +++ b/python/openai/openai_frontend/schemas/openai.py @@ -42,6 +42,9 @@ class PromptItem(RootModel): class CreateCompletionRequest(BaseModel): + # Explicitly return errors for unknown fields. + model_config: ConfigDict = ConfigDict(extra="forbid") + model: Union[str, Model1] = Field( ..., description="ID of the model to use. You can use the [List models](/docs/api-reference/models/list) API to see all of your available models, or see our [Model overview](/docs/models/overview) for descriptions of them.\n", @@ -776,6 +779,9 @@ def content(self): class CreateChatCompletionRequest(BaseModel): + # Explicitly return errors for unknown fields. + model_config: ConfigDict = ConfigDict(extra="forbid") + messages: List[ChatCompletionRequestMessage] = Field( ..., description="A list of messages comprising the conversation so far. [Example Python code](https://cookbook.openai.com/examples/how_to_format_inputs_to_chatgpt_models).", From 49162be0e678ff1059d328d019f2f1eebcaa03db Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 11:50:36 -0700 Subject: [PATCH 75/80] Add missing copyright headers --- python/openai/README.md | 27 +++++++++++++++++ .../openai/openai_frontend/engine/__init__.py | 25 ++++++++++++++++ .../openai_frontend/engine/utils/tokenizer.py | 30 +++++++++++++++++++ .../openai_frontend/frontend/__init__.py | 25 ++++++++++++++++ .../openai/openai_frontend/schemas/openai.py | 26 ++++++++++++++++ 5 files changed, 133 insertions(+) diff --git a/python/openai/README.md b/python/openai/README.md index 61ffc3f3dc..1a0b1abd5f 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -1,3 +1,30 @@ + # OpenAI-Compatible Frontend for Triton Inference Server ## Pre-requisites diff --git a/python/openai/openai_frontend/engine/__init__.py b/python/openai/openai_frontend/engine/__init__.py index e69de29bb2..77855ae979 100644 --- a/python/openai/openai_frontend/engine/__init__.py +++ b/python/openai/openai_frontend/engine/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai_frontend/engine/utils/tokenizer.py b/python/openai/openai_frontend/engine/utils/tokenizer.py index a60783a5f9..413be66f6b 100644 --- a/python/openai/openai_frontend/engine/utils/tokenizer.py +++ b/python/openai/openai_frontend/engine/utils/tokenizer.py @@ -1,3 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Adapted from +# https://github.com/triton-inference-server/server/blob/rmccormick-openai/python/openai/openai_frontend/engine/utils/tokenizer.py +# Copyright 2024 The vLLM team. + from typing import Optional, Union from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast diff --git a/python/openai/openai_frontend/frontend/__init__.py b/python/openai/openai_frontend/frontend/__init__.py index e69de29bb2..77855ae979 100644 --- a/python/openai/openai_frontend/frontend/__init__.py +++ b/python/openai/openai_frontend/frontend/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/openai/openai_frontend/schemas/openai.py b/python/openai/openai_frontend/schemas/openai.py index a012f277f1..9a562729a7 100644 --- a/python/openai/openai_frontend/schemas/openai.py +++ b/python/openai/openai_frontend/schemas/openai.py @@ -1,3 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ # generated by fastapi-codegen: # filename: api-spec/openai_trimmed.yml # timestamp: 2024-05-05T21:52:36+00:00 From fe45d39c5e26ed3f43d71dbc5c4d558dba8142c5 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 11:59:03 -0700 Subject: [PATCH 76/80] Review feedback: split app and test requirements to 2 requirements files --- python/openai/README.md | 6 ++++-- python/openai/requirements-test.txt | 3 +++ python/openai/requirements.txt | 4 ---- qa/L0_openai/test.sh | 26 ++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 python/openai/requirements-test.txt diff --git a/python/openai/README.md b/python/openai/README.md index 1a0b1abd5f..f6647da12a 100644 --- a/python/openai/README.md +++ b/python/openai/README.md @@ -54,7 +54,7 @@ docker run -it --net=host --gpus all --rm \ # Install python bindings for tritonserver and tritonfrontend pip install /opt/tritonserver/python/triton*.whl -# Install application/testing requirements +# Install application requirements git clone https://github.com/triton-inference-server/server.git cd server/python/openai/ pip install -r requirements.txt @@ -130,6 +130,8 @@ print(completion.choices[0].message.content) 8. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary): ```bash cd server/python/openai/ +pip install -r requirements-test.txt + pytest -v tests/ ``` @@ -155,7 +157,7 @@ docker run -it --net=host --gpus all --rm \ # Install python bindings for tritonserver and tritonfrontend pip install /opt/tritonserver/python/triton*.whl -# Install application/testing requirements +# Install application requirements git clone https://github.com/triton-inference-server/server.git cd server/python/openai/ pip install -r requirements.txt diff --git a/python/openai/requirements-test.txt b/python/openai/requirements-test.txt new file mode 100644 index 0000000000..08c098811b --- /dev/null +++ b/python/openai/requirements-test.txt @@ -0,0 +1,3 @@ +# Testing +pytest==8.1.1 +pytest-asyncio==0.23.8 diff --git a/python/openai/requirements.txt b/python/openai/requirements.txt index 24849af73a..d87feaa6f2 100644 --- a/python/openai/requirements.txt +++ b/python/openai/requirements.txt @@ -1,7 +1,3 @@ # FastAPI Application fastapi==0.111.1 openai==1.40.6 - -# Testing -pytest==8.1.1 -pytest-asyncio==0.23.8 diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 506bd352bb..c9ad4589e7 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -1,4 +1,29 @@ #!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ### Helpers ### @@ -9,6 +34,7 @@ function install_deps() { # Install application/testing requirements pushd openai/ pip install -r requirements.txt + pip install -r requirements-test.txt if [ "${IMAGE_KIND}" == "TRTLLM" ]; then prepare_tensorrtllm From 2261d1348068d34048a50b91a0d7c74c0674ebcb Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 12:11:16 -0700 Subject: [PATCH 77/80] Fix whitespace pre-commit, remove auto 'git add' from copyright tool --- qa/L0_openai/test.sh | 4 ++-- tools/add_copyright.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index c9ad4589e7..c910c204ac 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -1,6 +1,6 @@ #!/bin/bash # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: @@ -12,7 +12,7 @@ # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. -# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR diff --git a/tools/add_copyright.py b/tools/add_copyright.py index 34432bb0c6..7f523ad56f 100644 --- a/tools/add_copyright.py +++ b/tools/add_copyright.py @@ -257,7 +257,9 @@ def add_copyrights(paths): f"WARNING: No handler registered for file: {path}. Please add a new handler to {__file__}!" ) - subprocess.run(["git", "add"] + paths) + # Don't automatically 'git add' changes for now, make it more clear which + # files were changed and have ability to see 'git diff' on them. 
+ # subprocess.run(["git", "add"] + paths) print(f"Processed copyright headers for {len(paths)} file(s).") From 2e2a1904b639d78354f37960df9bd18703b6a0cb Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 12:27:30 -0700 Subject: [PATCH 78/80] Disable copyright pre-commit hook until fixed on GitHub Actions side --- .pre-commit-config.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 663a36d631..b11dc007bd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -73,12 +73,13 @@ repos: - id: requirements-txt-fixer - id: trailing-whitespace -- repo: local - hooks: - - id: add-license - name: Add License - entry: python tools/add_copyright.py - language: python - stages: [pre-commit] - verbose: true - require_serial: true +# FIXME: Only run on changed files when triggered by GitHub Actions +#- repo: local +# hooks: +# - id: add-license +# name: Add License +# entry: python tools/add_copyright.py +# language: python +# stages: [pre-commit] +# verbose: true +# require_serial: true From cc8657df53c4528a86930104200bfff1f7955215 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 12:27:46 -0700 Subject: [PATCH 79/80] Fix attribution for tokenizer util --- python/openai/openai_frontend/engine/utils/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/openai/openai_frontend/engine/utils/tokenizer.py b/python/openai/openai_frontend/engine/utils/tokenizer.py index 413be66f6b..982e553cea 100644 --- a/python/openai/openai_frontend/engine/utils/tokenizer.py +++ b/python/openai/openai_frontend/engine/utils/tokenizer.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Adapted from -# https://github.com/triton-inference-server/server/blob/rmccormick-openai/python/openai/openai_frontend/engine/utils/tokenizer.py +# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer.py # Copyright 2024 The vLLM team. from typing import Optional, Union From fa9501e6f207d894d1ff61c57de916a3a0209ce7 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Thu, 10 Oct 2024 13:02:18 -0700 Subject: [PATCH 80/80] Fix copyright header on copyright tool, remove unused import --- tools/add_copyright.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/add_copyright.py b/tools/add_copyright.py index 7f523ad56f..cf6b2a8686 100644 --- a/tools/add_copyright.py +++ b/tools/add_copyright.py @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: @@ -25,7 +26,6 @@ import argparse import os import re -import subprocess import sys from datetime import datetime from typing import Callable, Dict, Optional, Sequence
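A note on the buffer layout assumed by the `_to_string` helper introduced earlier in this series: per the comments in that patch, Triton serializes a single string output as a 4-byte length prefix followed by the raw UTF-8 bytes, which is why the helper offsets `data_ptr + 4` and reads `size - 4` bytes. The sketch below illustrates that assumed layout in plain Python with no `tritonserver` dependency; `parse_single_string_tensor` is a hypothetical stand-in, not part of the patches above.

```python
import struct


def parse_single_string_tensor(buffer: bytes) -> str:
    """Parse a serialized string tensor buffer holding exactly one element.

    Assumed layout (matching the _to_string helper): a 4-byte unsigned
    length prefix (little-endian assumed here) followed by UTF-8 bytes.
    """
    if len(buffer) < 4:
        raise ValueError(
            f"Expected at least 4 bytes for the length prefix, got {len(buffer)}."
        )
    (length,) = struct.unpack_from("<I", buffer, 0)
    if length != len(buffer) - 4:
        raise ValueError(
            f"Length prefix {length} does not match remaining {len(buffer) - 4} bytes."
        )
    return buffer[4 : 4 + length].decode("utf-8")


if __name__ == "__main__":
    # Round-trip a sample string through the assumed serialization.
    text = "Machine learning is"
    serialized = struct.pack("<I", len(text)) + text.encode("utf-8")
    assert parse_single_string_tensor(serialized) == text
```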
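The `extra="forbid"` change to the request schemas makes unknown fields an explicit validation error instead of being silently ignored, which is what the tests rely on when they assert a 422 status for invalid payloads. A minimal sketch of that behavior with a stand-in Pydantic v2 model (not the actual `CreateCompletionRequest`):

```python
from pydantic import BaseModel, ConfigDict, ValidationError


class ExampleRequest(BaseModel):
    # Reject any fields not declared on the model, mirroring the
    # extra="forbid" setting added to the OpenAI request schemas.
    model_config = ConfigDict(extra="forbid")

    model: str
    prompt: str


# Known fields validate normally.
ExampleRequest(model="llama-3.1-8b-instruct", prompt="Machine learning is")

# An unknown field now raises a validation error (surfaced by FastAPI
# as an HTTP 422 response) rather than being dropped.
try:
    ExampleRequest(model="llama-3.1-8b-instruct", prompt="hi", unknown_field=1)
except ValidationError as e:
    print(e)
```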
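The vLLM request-construction change above serializes the sampling parameters into a single JSON string input because, per the comment in that patch, list-valued fields such as `stop` aren't supported by `TRITONSERVER_Parameters` yet. A rough standalone sketch of that serialization step, using a plain dict in place of the real Pydantic request object:

```python
import json

# Hypothetical request payload standing in for a CreateCompletionRequest.
request = {
    "model": "llama-3.1-8b-instruct",
    "prompt": "Machine learning is",
    "stream": False,
    "temperature": 0.0,
    "stop": ["is", ".", ","],  # list-valued field that motivated the change
    "echo": None,
}

# Non-sampling fields are excluded and None values dropped, roughly mirroring
# model_dump_json(exclude=excludes, exclude_none=True) in the patch.
excludes = {"model", "prompt", "stream", "echo"}
sampling_parameters = json.dumps(
    {k: v for k, v in request.items() if k not in excludes and v is not None}
)
print(sampling_parameters)  # e.g. {"temperature": 0.0, "stop": ["is", ".", ","]}
```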