From 7348f6165c1d6e3f260d591887d38c4561c4f122 Mon Sep 17 00:00:00 2001
From: lewtun
Date: Mon, 16 Sep 2024 09:49:35 +0200
Subject: [PATCH 1/3] Expose revision arg in OpenAI server

This PR exposes the revision arg in the OpenAI-compatible server so that one
can run inference against a desired model revision. In particular, it resolves
the following error that arises when passing `--revision` to `vllm serve`:

```
Traceback (most recent call last):
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py", line 304, in hf_raise_for_status
    response.raise_for_status()
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/requests/models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/HuggingFaceH4/gemma-2-2b-gkd/resolve/main/config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/transformers/utils/hub.py", line 402, in cached_file
    resolved_file = hf_hub_download(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
    return f(*args, **kwargs)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 1240, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 1303, in _hf_hub_download_to_cache_dir
    (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 1752, in _get_metadata_or_catch_error
    metadata = get_hf_file_metadata(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 1674, in get_hf_file_metadata
    r = _request_wrapper(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 376, in _request_wrapper
    response = _request_wrapper(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 400, in _request_wrapper
    hf_raise_for_status(response)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py", line 315, in hf_raise_for_status
    raise EntryNotFoundError(message, response) from e
huggingface_hub.utils._errors.EntryNotFoundError: 404 Client Error. (Request ID: Root=1-66e7e09a-17e4310c5f1032c2189efc96;f483305a-4822-4bd4-a502-3a867fafecc9)

Entry Not Found for url: https://huggingface.co/HuggingFaceH4/gemma-2-2b-gkd/resolve/main/config.json.
The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/fsx/lewis/miniconda3/envs/mixeval/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/scripts.py", line 165, in main
    args.dispatch_function(args)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/scripts.py", line 37, in serve
    asyncio.run(run_server(args))
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 462, in run_server
    async with build_async_engine_client(args) as async_engine_client:
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 108, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 130, in build_async_engine_client_from_engine_args
    if (model_is_embedding(engine_args.model, engine_args.trust_remote_code,
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 71, in model_is_embedding
    return ModelConfig(model=model_name,
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/config.py", line 176, in __init__
    self.hf_config = get_config(self.model, trust_remote_code, revision,
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/vllm/transformers_utils/config.py", line 66, in get_config
    config = AutoConfig.from_pretrained(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 976, in from_pretrained
    config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/transformers/configuration_utils.py", line 632, in get_config_dict
    config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/transformers/configuration_utils.py", line 689, in _get_config_dict
    resolved_config_file = cached_file(
  File "/fsx/lewis/miniconda3/envs/mixeval/lib/python3.10/site-packages/transformers/utils/hub.py", line 456, in cached_file
    raise EnvironmentError(
OSError: HuggingFaceH4/gemma-2-2b-gkd does not appear to have a file named config.json. Checkout 'https://huggingface.co/HuggingFaceH4/gemma-2-2b-gkd/tree/main' for available files.
```
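
The root cause: `model_is_embedding` builds a `ModelConfig` without forwarding
`revision`, so the config lookup always resolves the repo's default branch. A
minimal sketch of the underlying lookup outside vLLM (the `revision` value
below is illustrative, not a branch this repo is known to have):

```
from transformers import AutoConfig

# Without `revision`, transformers fetches config.json from the default
# branch ("main"); for this model that 404s, producing the error above.
# AutoConfig.from_pretrained accepts a `revision` kwarg, which is what
# this PR threads through to ModelConfig.
config = AutoConfig.from_pretrained("HuggingFaceH4/gemma-2-2b-gkd",
                                    revision="v1.0")
```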
---
 vllm/entrypoints/openai/api_server.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d8704d5e24964..c3244848f6d30 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,9 +68,10 @@
 _running_tasks: Set[asyncio.Task] = set()
 
 
-def model_is_embedding(model_name: str, trust_remote_code: bool,
+def model_is_embedding(model_name: str, revision: str, trust_remote_code: bool,
                        quantization: Optional[str]) -> bool:
     return ModelConfig(model=model_name,
+                       revision=revision,
                        tokenizer=model_name,
                        tokenizer_mode="auto",
                        trust_remote_code=trust_remote_code,
@@ -129,8 +130,8 @@ async def build_async_engine_client_from_engine_args(
 
     # If manually triggered or embedding model, use AsyncLLMEngine in process.
     # TODO: support embedding model via RPC.
-    if (model_is_embedding(engine_args.model, engine_args.trust_remote_code,
-                           engine_args.quantization)
+    if (model_is_embedding(engine_args.model, engine_args.revision,
+                           engine_args.trust_remote_code, engine_args.quantization)
             or disable_frontend_multiprocessing):
         engine_client = AsyncLLMEngine.from_engine_args(
             engine_args, usage_context=UsageContext.OPENAI_API_SERVER)

From 65c0925f5247ea999b5b839d16041b140ddc82d0 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Mon, 16 Sep 2024 10:23:01 +0000
Subject: [PATCH 2/3] Fix style

---
 vllm/entrypoints/openai/api_server.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index c3244848f6d30..ca4a03d896821 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -130,8 +130,9 @@ async def build_async_engine_client_from_engine_args(
 
     # If manually triggered or embedding model, use AsyncLLMEngine in process.
     # TODO: support embedding model via RPC.
-    if (model_is_embedding(engine_args.model, engine_args.revision,
-                           engine_args.trust_remote_code, engine_args.quantization)
+    if (model_is_embedding(engine_args.model, engine_args.revision,
+                           engine_args.trust_remote_code,
+                           engine_args.quantization)
             or disable_frontend_multiprocessing):
         engine_client = AsyncLLMEngine.from_engine_args(
             engine_args, usage_context=UsageContext.OPENAI_API_SERVER)

From e85e5a4c8571e8fa2d039b7eb65fd153ac82d216 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Mon, 16 Sep 2024 10:32:04 +0000
Subject: [PATCH 3/3] Fix mypy

---
 vllm/entrypoints/openai/api_server.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ca4a03d896821..7c1f307e06619 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -68,8 +68,9 @@
 _running_tasks: Set[asyncio.Task] = set()
 
 
-def model_is_embedding(model_name: str, revision: str, trust_remote_code: bool,
-                       quantization: Optional[str]) -> bool:
+def model_is_embedding(model_name: str, trust_remote_code: bool,
+                       quantization: Optional[str],
+                       revision: Optional[str]) -> bool:
     return ModelConfig(model=model_name,
                        revision=revision,
                        tokenizer=model_name,
@@ -130,9 +131,8 @@ async def build_async_engine_client_from_engine_args(
 
     # If manually triggered or embedding model, use AsyncLLMEngine in process.
     # TODO: support embedding model via RPC.
-    if (model_is_embedding(engine_args.model, engine_args.revision,
-                           engine_args.trust_remote_code,
-                           engine_args.quantization)
+    if (model_is_embedding(engine_args.model, engine_args.trust_remote_code,
+                           engine_args.quantization, engine_args.revision)
             or disable_frontend_multiprocessing):
         engine_client = AsyncLLMEngine.from_engine_args(
             engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
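
For reference, a minimal sketch of how the patched helper ends up being called
once all three patches apply (the model name and revision are illustrative;
`AsyncEngineArgs` already carries a `revision` field populated by `vllm serve`):

```
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import model_is_embedding

# `revision` is now forwarded to ModelConfig, so the config is resolved
# from the requested revision instead of always "main".
engine_args = AsyncEngineArgs(model="HuggingFaceH4/gemma-2-2b-gkd",
                              revision="v1.0")
is_embedding = model_is_embedding(engine_args.model,
                                  engine_args.trust_remote_code,
                                  engine_args.quantization,
                                  engine_args.revision)
```

This mirrors the final call site in `build_async_engine_client_from_engine_args`;
end users only need `vllm serve <model> --revision <rev>`.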