diff --git a/Dockerfile b/Dockerfile index 6a56a33cfe7ac..1f254c76fe5af 100644 --- a/Dockerfile +++ b/Dockerfile @@ -122,7 +122,7 @@ RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,ta FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer + pip install accelerate hf_transfer modelscope COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 95e54bd151850..96749b9327d7a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,3 +1,10 @@ sphinx == 6.2.1 sphinx-book-theme == 1.0.1 sphinx-copybutton == 0.5.2 +myst-parser == 2.0.0 +sphinx-argparse + +# packages to install to build the documentation +pydantic +-f https://download.pytorch.org/whl/cpu +torch \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 61d24e1612128..2ca0d642b7463 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ # -- Project information ----------------------------------------------------- project = 'vLLM' -copyright = '2023, vLLM Team' +copyright = '2024, vLLM Team' author = 'the vLLM Team' # -- General configuration --------------------------------------------------- @@ -37,6 +37,8 @@ "sphinx_copybutton", "sphinx.ext.autodoc", "sphinx.ext.autosummary", + "myst_parser", + "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.rst new file mode 100644 index 0000000000000..844859b3ec1f0 --- /dev/null +++ b/docs/source/dev/sampling_params.rst @@ -0,0 +1,4 @@ +Sampling Params +=============== + +.. automodule:: vllm.sampling_params.SamplingParams \ No newline at end of file diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 5d9fdf4056709..3d736bf7120ec 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -100,7 +100,7 @@ You can build and install vLLM from source: Build a docker image from `Dockerfile.rocm`, and launch a docker container. -The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments: +The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments: * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1` * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 0aff1037d8a29..62bf779c339d5 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -128,6 +128,7 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able .. 
code-block:: console + $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt $ pip install . diff --git a/docs/source/index.rst b/docs/source/index.rst index 65bfbbabf8be1..72081588b1bcf 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -69,14 +69,11 @@ Documentation :maxdepth: 1 :caption: Serving - serving/distributed_serving - serving/run_on_sky - serving/deploying_with_kserve - serving/deploying_with_triton - serving/deploying_with_bentoml + serving/openai_compatible_server serving/deploying_with_docker - serving/serving_with_langchain + serving/distributed_serving serving/metrics + serving/integrations .. toctree:: :maxdepth: 1 @@ -98,6 +95,7 @@ Documentation :maxdepth: 2 :caption: Developer Documentation + dev/sampling_params dev/engine/engine_index dev/kernel/paged_attention diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index f05fafe9f8279..2278640481a91 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,7 +90,7 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). -The following is an example request +The following is an example request .. code-block:: bash diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst new file mode 100644 index 0000000000000..93872397913e3 --- /dev/null +++ b/docs/source/serving/integrations.rst @@ -0,0 +1,11 @@ +Integrations +------------ + +.. toctree:: + :maxdepth: 1 + + run_on_sky + deploying_with_kserve + deploying_with_triton + deploying_with_bentoml + serving_with_langchain diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md new file mode 100644 index 0000000000000..032fe5d03bd52 --- /dev/null +++ b/docs/source/serving/openai_compatible_server.md @@ -0,0 +1,114 @@ +# OpenAI Compatible Server + +vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) APIs. + +You can start the server using Python, or using [Docker](deploying_with_docker.rst): +```bash +python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-hf --dtype float32 --api-key token-abc123 +``` + +To call the server, you can use the official OpenAI Python client library, or any other HTTP client. +```python +from openai import OpenAI +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", +) + +completion = client.chat.completions.create( + model="meta-llama/Llama-2-7b-hf", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ] +) + +print(completion.choices[0].message) +``` + +## API Reference +Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: +- Chat: `tools` and `tool_choice`. +- Completions: `suffix`. + +## Extra Parameters +vLLM supports a set of parameters that are not part of the OpenAI API. +To use them, pass them as extra parameters in the OpenAI client, +or merge them directly into the JSON payload if you are calling the HTTP endpoint directly.
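+
+If you are calling the HTTP endpoint directly, the extra field is simply merged into the JSON
+body of the request. Below is a minimal sketch using the third-party `requests` package; it
+assumes the server started above with `--api-key token-abc123` and mirrors the client-based
+example that follows:
+
+```python
+import requests
+
+# Assumes the server from the example above: meta-llama/Llama-2-7b-hf served
+# at http://localhost:8000 with --api-key token-abc123.
+response = requests.post(
+    "http://localhost:8000/v1/chat/completions",
+    headers={"Authorization": "Bearer token-abc123"},
+    json={
+        "model": "meta-llama/Llama-2-7b-hf",
+        "messages": [
+            {"role": "user",
+             "content": "Classify this sentiment: vLLM is wonderful!"}
+        ],
+        # vLLM-specific extra parameter merged directly into the payload
+        "guided_choice": ["positive", "negative"],
+    },
+)
+print(response.json()["choices"][0]["message"]["content"])
+```
+
+The same request through the OpenAI Python client passes the extra field via `extra_body`: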
+ +```python +completion = client.chat.completions.create( + model="meta-llama/Llama-2-7b-hf", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={ + "guided_choice": ["positive", "negative"] + } +) +``` + +### Extra Parameters for Chat API +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-chat-completion-sampling-params +:end-before: end-chat-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-chat-completion-extra-params +:end-before: end-chat-completion-extra-params +``` + +### Extra Parameters for Completions API +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +## Chat Template + +In order for the language model to support the chat protocol, vLLM requires the model to include +a chat template in its tokenizer configuration. The chat template is a Jinja2 template that +specifies how roles, messages, and other chat-specific tokens are encoded in the input. + +An example chat template for `meta-llama/Llama-2-7b-chat-hf` can be found [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/09bd0f49e16738cdfaa6e615203e126038736eb0/tokenizer_config.json#L12). + +Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models, +you can manually specify their chat template in the `--chat-template` parameter, either as the file path to the chat +template or as the template in string form. Without a chat template, the server will not be able to process chat, +and all chat requests will error. + +```bash +python -m vllm.entrypoints.openai.api_server \ + --model ... \ + --chat-template ./path-to-chat-template.jinja +``` + +The vLLM community provides a set of chat templates for popular models.
You can find them in the examples +directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) + +## Command line arguments for the server + +```{argparse} +:module: vllm.entrypoints.openai.cli_args +:func: make_arg_parser +:prog: vllm-openai-server +``` \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index f792e89095246..51ae66e2375ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -577,12 +577,12 @@ class DeviceConfig: def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection - if torch.cuda.is_available(): - self.device_type = "cuda" - elif is_neuron(): + if is_neuron(): self.device_type = "neuron" else: - raise RuntimeError("No supported device detected.") + # We don't call torch.cuda.is_available() here to + # avoid initializing CUDA before workers are forked + self.device_type = "cuda" else: # Device type is assigned explicitly self.device_type = device diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 742f3dc575190..27414f085b45a 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -33,8 +33,17 @@ def __getattr__(self, name): return getattr(self.worker, name) def execute_method(self, method, *args, **kwargs): - executor = getattr(self, method) - return executor(*args, **kwargs) + try: + executor = getattr(self, method) + return executor(*args, **kwargs) + except Exception as e: + # exceptions in ray worker may cause deadlock + # see https://github.com/vllm-project/vllm/issues/3455 + # print the error and inform the user to solve the error + msg = (f"Error executing method {method}. " + "This might cause deadlock in distributed execution.") + logger.exception(msg) + raise e def get_node_ip(self) -> str: return get_ip() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e0626ca4e9da1..a0685a4d38fbe 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,11 +1,8 @@ -import argparse import asyncio -import json from contextlib import asynccontextmanager import os import importlib import inspect -import ssl from prometheus_client import make_asgi_app import fastapi @@ -23,9 +20,9 @@ ChatCompletionRequest, ErrorResponse) from vllm.logger import init_logger +from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion -from vllm.entrypoints.openai.serving_engine import LoRA TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -51,109 +48,8 @@ async def _force_log(): app = fastapi.FastAPI(lifespan=lifespan) -class LoRAParserAction(argparse.Action): - - def __call__(self, parser, namespace, values, option_string=None): - lora_list = [] - for item in values: - name, path = item.split('=') - lora_list.append(LoRA(name, path)) - setattr(namespace, self.dest, lora_list) - - def parse_args(): - parser = argparse.ArgumentParser( - description="vLLM OpenAI-Compatible RESTful API server.") - parser.add_argument("--host", type=str, default=None, help="host name") - parser.add_argument("--port", type=int, default=8000, help="port number") - parser.add_argument( - "--uvicorn-log-level", - type=str, - default="info", - choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], - help="log level for uvicorn") - parser.add_argument("--allow-credentials", - action="store_true", - help="allow credentials") - parser.add_argument("--allowed-origins", - 
type=json.loads, - default=["*"], - help="allowed origins") - parser.add_argument("--allowed-methods", - type=json.loads, - default=["*"], - help="allowed methods") - parser.add_argument("--allowed-headers", - type=json.loads, - default=["*"], - help="allowed headers") - parser.add_argument("--api-key", - type=str, - default=None, - help="If provided, the server will require this key " - "to be presented in the header.") - parser.add_argument("--served-model-name", - type=str, - default=None, - help="The model name used in the API. If not " - "specified, the model name will be the same as " - "the huggingface name.") - parser.add_argument( - "--lora-modules", - type=str, - default=None, - nargs='+', - action=LoRAParserAction, - help="LoRA module configurations in the format name=path. " - "Multiple modules can be specified.") - parser.add_argument("--chat-template", - type=str, - default=None, - help="The file path to the chat template, " - "or the template in single-line form " - "for the specified model") - parser.add_argument("--response-role", - type=str, - default="assistant", - help="The role name to return if " - "`request.add_generation_prompt=true`.") - parser.add_argument("--ssl-keyfile", - type=str, - default=None, - help="The file path to the SSL key file") - parser.add_argument("--ssl-certfile", - type=str, - default=None, - help="The file path to the SSL cert file") - parser.add_argument("--ssl-ca-certs", - type=str, - default=None, - help="The CA certificates file") - parser.add_argument( - "--ssl-cert-reqs", - type=int, - default=int(ssl.CERT_NONE), - help="Whether client certificate is required (see stdlib ssl module's)" - ) - parser.add_argument( - "--root-path", - type=str, - default=None, - help="FastAPI root_path when app is behind a path based routing proxy") - parser.add_argument( - "--middleware", - type=str, - action="append", - default=[], - help="Additional ASGI middleware to apply to the app. " - "We accept multiple --middleware arguments. " - "The value should be an import path. " - "If a function is provided, vLLM will add it to the server " - "using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server " - "using app.add_middleware(). ") - - parser = AsyncEngineArgs.add_cli_args(parser) + parser = make_arg_parser() return parser.parse_args() diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py new file mode 100644 index 0000000000000..cc71931b97955 --- /dev/null +++ b/vllm/entrypoints/openai/cli_args.py @@ -0,0 +1,118 @@ +""" +This file contains the command line arguments for the vLLM's +OpenAI-compatible server. It is kept in a separate file for documentation +purposes. 
+""" + +import argparse +import json +import ssl + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.serving_engine import LoRA + + +class LoRAParserAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + lora_list = [] + for item in values: + name, path = item.split('=') + lora_list.append(LoRA(name, path)) + setattr(namespace, self.dest, lora_list) + + +def make_arg_parser(): + parser = argparse.ArgumentParser( + description="vLLM OpenAI-Compatible RESTful API server.") + parser.add_argument("--host", type=str, default=None, help="host name") + parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], + help="log level for uvicorn") + parser.add_argument("--allow-credentials", + action="store_true", + help="allow credentials") + parser.add_argument("--allowed-origins", + type=json.loads, + default=["*"], + help="allowed origins") + parser.add_argument("--allowed-methods", + type=json.loads, + default=["*"], + help="allowed methods") + parser.add_argument("--allowed-headers", + type=json.loads, + default=["*"], + help="allowed headers") + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. If not " + "specified, the model name will be the same as " + "the huggingface name.") + parser.add_argument( + "--lora-modules", + type=str, + default=None, + nargs='+', + action=LoRAParserAction, + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") + parser.add_argument("--chat-template", + type=str, + default=None, + help="The file path to the chat template, " + "or the template in single-line form " + "for the specified model") + parser.add_argument("--response-role", + type=str, + default="assistant", + help="The role name to return if " + "`request.add_generation_prompt=true`.") + parser.add_argument("--ssl-keyfile", + type=str, + default=None, + help="The file path to the SSL key file") + parser.add_argument("--ssl-certfile", + type=str, + default=None, + help="The file path to the SSL cert file") + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)" + ) + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy") + parser.add_argument( + "--middleware", + type=str, + action="append", + default=[], + help="Additional ASGI middleware to apply to the app. " + "We accept multiple --middleware arguments. " + "The value should be an import path. " + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). 
") + + parser = AsyncEngineArgs.add_cli_args(parser) + return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 9421880411611..1f089d524fd03 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -61,41 +61,80 @@ class ResponseFormat(BaseModel): class ChatCompletionRequest(BaseModel): - model: str + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/chat/create messages: List[Dict[str, str]] - temperature: Optional[float] = 0.7 - top_p: Optional[float] = 1.0 - n: Optional[int] = 1 + model: str + frequency_penalty: Optional[float] = 0.0 + logit_bias: Optional[Dict[str, float]] = None + logprobs: Optional[bool] = False + top_logprobs: Optional[int] = None max_tokens: Optional[int] = None + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0.0 + response_format: Optional[ResponseFormat] = None seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False - logprobs: Optional[bool] = False - top_logprobs: Optional[int] = None - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 1.0 user: Optional[str] = None - # Additional parameters supported by vLLM + + # doc: begin-chat-completion-sampling-params best_of: Optional[int] = None - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + top_k: Optional[int] = -1 + min_p: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.0 + length_penalty: Optional[float] = 1.0 early_stopping: Optional[bool] = False + ignore_eos: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True - add_generation_prompt: Optional[bool] = True - echo: Optional[bool] = False - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False - length_penalty: Optional[float] = 1.0 - guided_json: Optional[Union[str, dict, BaseModel]] = None - guided_regex: Optional[str] = None - guided_choice: Optional[List[str]] = None - guided_grammar: Optional[str] = None - response_format: Optional[ResponseFormat] = None + # doc: end-chat-completion-sampling-params + + # doc: begin-chat-completion-extra-params + echo: Optional[bool] = Field( + default=False, + description=( + "If true, the new message will be prepended with the last message " + "if they belong to the same role."), + ) + add_generation_prompt: Optional[bool] = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + include_stop_str_in_output: Optional[bool] = Field( + default=False, + description=( + "Whether to include the stop string in the output. 
" + "This is only applied when the stop or stop_token_ids is set."), + ) + guided_json: Optional[Union[str, dict, BaseModel]] = Field( + default=None, + description=("If specified, the output will follow the JSON schema."), + ) + guided_regex: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the regex pattern."), + ) + guided_choice: Optional[List[str]] = Field( + default=None, + description=( + "If specified, the output will be exactly one of the choices."), + ) + guided_grammar: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the context free grammar."), + ) + + # doc: end-chat-completion-extra-params def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: @@ -157,41 +196,74 @@ def check_guided_decoding_count(cls, data): class CompletionRequest(BaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/completions/create model: str - # a string, array of strings, array of tokens, or array of token arrays prompt: Union[List[int], List[List[int]], str, List[str]] - suffix: Optional[str] = None - max_tokens: Optional[int] = 16 - temperature: Optional[float] = 1.0 - top_p: Optional[float] = 1.0 - n: Optional[int] = 1 - stream: Optional[bool] = False - logprobs: Optional[int] = None + best_of: Optional[int] = None echo: Optional[bool] = False - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - seed: Optional[int] = None - presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 - best_of: Optional[int] = None logit_bias: Optional[Dict[str, float]] = None + logprobs: Optional[int] = None + max_tokens: Optional[int] = 16 + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0.0 + seed: Optional[int] = None + stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stream: Optional[bool] = False + suffix: Optional[str] = None + temperature: Optional[float] = 1.0 + top_p: Optional[float] = 1.0 user: Optional[str] = None - # Additional parameters supported by vLLM - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False + + # doc: begin-completion-sampling-params use_beam_search: Optional[bool] = False + top_k: Optional[int] = -1 + min_p: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.0 + length_penalty: Optional[float] = 1.0 early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) + ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False - length_penalty: Optional[float] = 1.0 - guided_json: Optional[Union[str, dict, BaseModel]] = None - guided_regex: Optional[str] = None - guided_choice: Optional[List[str]] = None - guided_grammar: Optional[str] = None - response_format: Optional[ResponseFormat] = None + # doc: end-completion-sampling-params + + # doc: begin-completion-extra-params + include_stop_str_in_output: Optional[bool] = Field( + default=False, + description=( + "Whether to include the stop string in the output. " + "This is only applied when the stop or stop_token_ids is set."), + ) + response_format: Optional[ResponseFormat] = Field( + default=None, + description= + ("Similar to chat completion, this parameter specifies the format of " + "output. 
Only {'type': 'json_object'} or {'type': 'text' } is " + "supported."), + ) + guided_json: Optional[Union[str, dict, BaseModel]] = Field( + default=None, + description=("If specified, the output will follow the JSON schema."), + ) + guided_regex: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the regex pattern."), + ) + guided_choice: Optional[List[str]] = Field( + default=None, + description=( + "If specified, the output will be exactly one of the choices."), + ) + guided_grammar: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the context free grammar."), + ) + + # doc: end-completion-extra-params def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 diff --git a/vllm/utils.py b/vllm/utils.py index 729a4332af967..d4a8c962c3bfc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -4,7 +4,6 @@ import subprocess import uuid import gc -from functools import cache from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -121,7 +120,6 @@ def is_hip() -> bool: return torch.version.hip is not None -@cache def is_neuron() -> bool: try: import transformers_neuronx @@ -130,7 +128,6 @@ def is_neuron() -> bool: return transformers_neuronx is not None -@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # NOTE: This import statement should be executed lazily since @@ -154,7 +151,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -@cache def in_wsl() -> bool: # Reference: https://github.com/microsoft/WSL/issues/4071 return "microsoft" in " ".join(uname()).lower() @@ -229,7 +225,6 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) -@cache def get_nvcc_cuda_version() -> Optional[Version]: cuda_home = os.environ.get('CUDA_HOME') if not cuda_home: