From 0ca0b3d86cb714d1bef1ab7c2d037a6b41248e17 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Thu, 28 Mar 2024 10:22:21 -0700 Subject: [PATCH 01/38] [optional-tokenizer] make tokenzier optional in initialization --- vllm/config.py | 7 ++++++- vllm/engine/arg_utils.py | 3 ++- vllm/engine/llm_engine.py | 17 ++++++++++------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e27c8eb4fd257..7d5941ae8aa35 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,6 +71,8 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. + enable_tokenizer: If False, will not enable tokenizer in the LLM engine. + User needs to feed prompt IDs for inference. """ def __init__( @@ -92,6 +94,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, + enable_tokenizer: bool = True, ) -> None: self.model = model self.tokenizer = tokenizer @@ -108,6 +111,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs + self.enable_tokenizer = enable_tokenizer if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -132,7 +136,8 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - self._verify_tokenizer_mode() + if self.enable_tokenizer: + self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a6197942645e4..4951bd7f2beae 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,6 +44,7 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False + enable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -422,7 +423,7 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs) + self.max_context_len_to_capture, self.max_logprobs, self.enable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c22585a3768f2..1ec254395ef52 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -112,8 +112,9 @@ def __init__( self.speculative_config = speculative_config self.log_stats = log_stats - self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) + if self.model_config.enable_tokenizer: + self.detokenizer = Detokenizer(self.tokenizer) + self._init_tokenizer() self.seq_counter = Counter() self.model_executor = executor_class( @@ -162,9 +163,10 @@ def __init__( parallel_config.disable_custom_all_reduce, }) - # Ping the tokenizer to ensure liveness if it runs in a - # different process. - self.tokenizer.ping() + if self.model_config.enable_tokenizer: + # Ping the tokenizer to ensure liveness if it runs in a + # different process. + self.tokenizer.ping() # Create the scheduler. 
# NOTE: the cache_config here have been updated with the numbers of @@ -333,8 +335,9 @@ def add_request( # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) - eos_token_id = self.tokenizer.get_lora_tokenizer( - lora_request).eos_token_id + eos_token_id = None + if self.model_config.enable_tokenizer: + eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) From a50f0c7efae07d1870475e5d8100f375c57dc3e3 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 12:14:52 -0700 Subject: [PATCH 02/38] [tokenizer] make detokenization optional --- vllm/config.py | 9 ++++----- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/llm_engine.py | 12 ++++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7d5941ae8aa35..5953468296435 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,8 +71,7 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - enable_tokenizer: If False, will not enable tokenizer in the LLM engine. - User needs to feed prompt IDs for inference. + disable_tokenizer: If False, tokenization and detokenization are disabled. """ def __init__( @@ -94,7 +93,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, - enable_tokenizer: bool = True, + disable_tokenizer: bool = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -111,7 +110,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs - self.enable_tokenizer = enable_tokenizer + self.disable_tokenizer = disable_tokenizer if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -136,7 +135,7 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - if self.enable_tokenizer: + if not self.disable_tokenizer: self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4951bd7f2beae..6929a228c37c5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,7 +44,7 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False - enable_tokenizer: bool = False, + disable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -423,7 +423,7 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs, self.enable_tokenizer) + self.max_context_len_to_capture, self.max_logprobs, self.disable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1ec254395ef52..ebece43d32766 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -112,9 +112,13 @@ def __init__( self.speculative_config = 
speculative_config self.log_stats = log_stats - if self.model_config.enable_tokenizer: + if not self.model_config.disable_tokenizer: self.detokenizer = Detokenizer(self.tokenizer) self._init_tokenizer() + else: + self.detokenizer = None + self.tokenizer = None + self.seq_counter = Counter() self.model_executor = executor_class( @@ -163,7 +167,7 @@ def __init__( parallel_config.disable_custom_all_reduce, }) - if self.model_config.enable_tokenizer: + if self.tokenizer: # Ping the tokenizer to ensure liveness if it runs in a # different process. self.tokenizer.ping() @@ -336,7 +340,7 @@ def add_request( block_size = self.cache_config.block_size seq_id = next(self.seq_counter) eos_token_id = None - if self.model_config.enable_tokenizer: + if self.tokenizer: eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) @@ -481,7 +485,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - if seq_group.sampling_params.detokenize: + if seq_group.sampling_params.detokenize and self.detokenizer: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) From e144c2e4c4469bfe22f759ea0b18d543622563d5 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 14:47:17 -0700 Subject: [PATCH 03/38] [tokenizer] fix parameter description --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 5953468296435..7d0b9fa3ff8ba 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,7 +71,7 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - disable_tokenizer: If False, tokenization and detokenization are disabled. + disable_tokenizer: If true, tokenization and detokenization are disabled. """ def __init__( From 5fb16f27fa76301dfe5d5ecba6824d6be1c44a4d Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 18:44:32 -0700 Subject: [PATCH 04/38] [tokenizer] fix initialize engine args --- vllm/engine/arg_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6929a228c37c5..57ed3c86f4c85 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None + disable_tokenizer: bool = False, tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None From 904edcc2f07a98d3e0bd7924edf4d6a71986b241 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 19:04:41 -0700 Subject: [PATCH 05/38] [tokenizer] fix format --- vllm/config.py | 3 ++- vllm/engine/llm_engine.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7d0b9fa3ff8ba..50c80b7fe30d7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,7 +71,8 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - disable_tokenizer: If true, tokenization and detokenization are disabled. + disable_tokenizer: If true, tokenization and detokenization are + disabled. 
""" def __init__( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ebece43d32766..c0d41d4f290f1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -341,7 +341,8 @@ def add_request( seq_id = next(self.seq_counter) eos_token_id = None if self.tokenizer: - eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) @@ -486,8 +487,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, for seq, _ in child_seqs: if seq_group.sampling_params.detokenize and self.detokenizer: - self.detokenizer.decode_sequence_inplace( - seq, seq_group.sampling_params) + self.detokenizer.decode_sequence_inplace(seq, + seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From cfc26600c6a7143724fae9241d161252c18f179d Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 21:42:23 -0700 Subject: [PATCH 06/38] [tokenization] fix arg parser field --- vllm/engine/arg_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 57ed3c86f4c85..7a40a60754e26 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -45,7 +45,6 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False - disable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -96,6 +95,10 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') + parser.add_argument( + '--disable_tokenizer', + action='store_true', + help='Disable tokenization and detokenization') parser.add_argument( '--revision', type=str, From 013a36a30b462f70b100fdaf49fe0fee1d403c90 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sun, 31 Mar 2024 22:56:26 -0700 Subject: [PATCH 07/38] [tokenizer] fix the order of initializing tokenizer and de-tokenizer --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c0d41d4f290f1..254961849af4b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -113,8 +113,8 @@ def __init__( self.log_stats = log_stats if not self.model_config.disable_tokenizer: - self.detokenizer = Detokenizer(self.tokenizer) self._init_tokenizer() + self.detokenizer = Detokenizer(self.tokenizer) else: self.detokenizer = None self.tokenizer = None From fb3eefde02b5cb5fdfb353803e733bc6117942b5 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Mon, 1 Apr 2024 16:50:47 -0700 Subject: [PATCH 08/38] [tokenizer] Never disable tok in LLM initialization --- vllm/entrypoints/llm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c1..bb61919d47b68 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -95,6 +95,7 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, + disable_tokenizer=False, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, From 07cc2e579c82fa624587894d662bd93d6d60ee65 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Mon, 1 Apr 2024 21:46:05 -0700 Subject: [PATCH 09/38] [tokenizer] Add flag value to log 
info to help debug --- vllm/engine/llm_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 254961849af4b..9f901d9da23c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -83,6 +83,7 @@ def __init__( f"model={model_config.model!r}, " f"speculative_config={speculative_config!r}, " f"tokenizer={model_config.tokenizer!r}, " + f"disable_tokenizer={model_config.disable_tokenizer}, " f"tokenizer_mode={model_config.tokenizer_mode}, " f"revision={model_config.revision}, " f"tokenizer_revision={model_config.tokenizer_revision}, " From 5b308254c7097433e0ad22296a5a4d164c70f7ba Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Tue, 2 Apr 2024 00:37:31 -0700 Subject: [PATCH 10/38] [tokenizer] fix type with a ',', it becomes a tuple, and deemed as True. --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7a40a60754e26..695a28c047285 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None - disable_tokenizer: bool = False, + disable_tokenizer: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None From 676256ffd32eac2d137a88df95a4337eebdd7d86 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Tue, 2 Apr 2024 00:54:52 -0700 Subject: [PATCH 11/38] [tokenizer] fix yapf errors --- vllm/engine/arg_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 695a28c047285..ce6b1a19a73c7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -95,10 +95,9 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') - parser.add_argument( - '--disable_tokenizer', - action='store_true', - help='Disable tokenization and detokenization') + parser.add_argument('--disable_tokenizer', + action='store_true', + help='Disable tokenization and detokenization') parser.add_argument( '--revision', type=str, From f7cd883882ff451ec928ca1f51feb0f1959fae28 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Thu, 4 Apr 2024 10:17:20 -0700 Subject: [PATCH 12/38] [tokenizer] fix formatting --- vllm/engine/arg_utils.py | 3 ++- vllm/engine/llm_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ce6b1a19a73c7..e9a8e3ba63c83 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -426,7 +426,8 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs, self.disable_tokenizer) + self.max_context_len_to_capture, self.max_logprobs, + self.disable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9f901d9da23c6..832a24c8d8c72 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -488,8 +488,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, for seq, _ in child_seqs: if seq_group.sampling_params.detokenize and self.detokenizer: - 
self.detokenizer.decode_sequence_inplace(seq, - seq_group.sampling_params) + self.detokenizer.decode_sequence_inplace( + seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From 0ea844623ddb496ff5acb35828bd17dfac87b6b1 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Thu, 28 Mar 2024 10:22:21 -0700 Subject: [PATCH 13/38] [optional-tokenizer] make tokenzier optional in initialization --- vllm/config.py | 7 ++++++- vllm/engine/arg_utils.py | 3 ++- vllm/engine/llm_engine.py | 17 ++++++++++------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6762a75f25f28..cb7101e0a5d01 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,6 +71,8 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. + enable_tokenizer: If False, will not enable tokenizer in the LLM engine. + User needs to feed prompt IDs for inference. """ def __init__( @@ -92,6 +94,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, + enable_tokenizer: bool = True, ) -> None: self.model = model self.tokenizer = tokenizer @@ -108,6 +111,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs + self.enable_tokenizer = enable_tokenizer if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -132,7 +136,8 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - self._verify_tokenizer_mode() + if self.enable_tokenizer: + self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a6197942645e4..4951bd7f2beae 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,6 +44,7 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False + enable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -422,7 +423,7 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs) + self.max_context_len_to_capture, self.max_logprobs, self.enable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a9a4a7b83d934..eb04b9aecbef8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -112,8 +112,9 @@ def __init__( self.speculative_config = speculative_config self.log_stats = log_stats - self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) + if self.model_config.enable_tokenizer: + self.detokenizer = Detokenizer(self.tokenizer) + self._init_tokenizer() self.seq_counter = Counter() self.model_executor = executor_class( @@ -162,9 +163,10 @@ def __init__( parallel_config.disable_custom_all_reduce, }) - # Ping the tokenizer to ensure liveness if it runs in a - # different process. 
- self.tokenizer.ping() + if self.model_config.enable_tokenizer: + # Ping the tokenizer to ensure liveness if it runs in a + # different process. + self.tokenizer.ping() # Create the scheduler. # NOTE: the cache_config here have been updated with the numbers of @@ -333,8 +335,9 @@ def add_request( # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) - eos_token_id = self.tokenizer.get_lora_tokenizer( - lora_request).eos_token_id + eos_token_id = None + if self.model_config.enable_tokenizer: + eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) From a7be734504942d5acbde47f3c7fcb1804b69dd06 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 12:14:52 -0700 Subject: [PATCH 14/38] [tokenizer] make detokenization optional --- vllm/config.py | 9 ++++----- vllm/engine/arg_utils.py | 4 ++-- vllm/engine/llm_engine.py | 12 ++++++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index cb7101e0a5d01..e49f02163c0ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,8 +71,7 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - enable_tokenizer: If False, will not enable tokenizer in the LLM engine. - User needs to feed prompt IDs for inference. + disable_tokenizer: If False, tokenization and detokenization are disabled. """ def __init__( @@ -94,7 +93,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, - enable_tokenizer: bool = True, + disable_tokenizer: bool = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -111,7 +110,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs - self.enable_tokenizer = enable_tokenizer + self.disable_tokenizer = disable_tokenizer if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -136,7 +135,7 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - if self.enable_tokenizer: + if not self.disable_tokenizer: self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4951bd7f2beae..6929a228c37c5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,7 +44,7 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False - enable_tokenizer: bool = False, + disable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -423,7 +423,7 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs, self.enable_tokenizer) + self.max_context_len_to_capture, self.max_logprobs, self.disable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py 
b/vllm/engine/llm_engine.py index eb04b9aecbef8..57479da642439 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -112,9 +112,13 @@ def __init__( self.speculative_config = speculative_config self.log_stats = log_stats - if self.model_config.enable_tokenizer: + if not self.model_config.disable_tokenizer: self.detokenizer = Detokenizer(self.tokenizer) self._init_tokenizer() + else: + self.detokenizer = None + self.tokenizer = None + self.seq_counter = Counter() self.model_executor = executor_class( @@ -163,7 +167,7 @@ def __init__( parallel_config.disable_custom_all_reduce, }) - if self.model_config.enable_tokenizer: + if self.tokenizer: # Ping the tokenizer to ensure liveness if it runs in a # different process. self.tokenizer.ping() @@ -336,7 +340,7 @@ def add_request( block_size = self.cache_config.block_size seq_id = next(self.seq_counter) eos_token_id = None - if self.model_config.enable_tokenizer: + if self.tokenizer: eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) @@ -481,7 +485,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - if seq_group.sampling_params.detokenize: + if seq_group.sampling_params.detokenize and self.detokenizer: self.detokenizer.decode_sequence_inplace( seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) From 1e613f6a23fc7bce4115ae040085923cb0aa92c6 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 14:47:17 -0700 Subject: [PATCH 15/38] [tokenizer] fix parameter description --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index e49f02163c0ff..80e89e6b4ad56 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,7 +71,7 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - disable_tokenizer: If False, tokenization and detokenization are disabled. + disable_tokenizer: If true, tokenization and detokenization are disabled. """ def __init__( From 3b94adbc38112e2d6c733e6b58ec07c1b83c9a0f Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 18:44:32 -0700 Subject: [PATCH 16/38] [tokenizer] fix initialize engine args --- vllm/engine/arg_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6929a228c37c5..57ed3c86f4c85 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,6 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None + disable_tokenizer: bool = False, tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None From eab1dd74bae977709391fe04518d3bb0819d66cc Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 19:04:41 -0700 Subject: [PATCH 17/38] [tokenizer] fix format --- vllm/config.py | 3 ++- vllm/engine/llm_engine.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 80e89e6b4ad56..db3cdfaafc8d9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,7 +71,8 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. 
- disable_tokenizer: If true, tokenization and detokenization are disabled. + disable_tokenizer: If true, tokenization and detokenization are + disabled. """ def __init__( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 57479da642439..ef899c4bb2f3f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -341,7 +341,8 @@ def add_request( seq_id = next(self.seq_counter) eos_token_id = None if self.tokenizer: - eos_token_id = self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, eos_token_id, lora_request) @@ -486,8 +487,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, for seq, _ in child_seqs: if seq_group.sampling_params.detokenize and self.detokenizer: - self.detokenizer.decode_sequence_inplace( - seq, seq_group.sampling_params) + self.detokenizer.decode_sequence_inplace(seq, + seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From 58ccf643b0dfcf803c3d05d1e1271384fe4d4c20 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 30 Mar 2024 21:42:23 -0700 Subject: [PATCH 18/38] [tokenization] fix arg parser field --- vllm/engine/arg_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 57ed3c86f4c85..7a40a60754e26 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -45,7 +45,6 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False - disable_tokenizer: bool = False, tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" tokenizer_pool_extra_config: Optional[dict] = None @@ -96,6 +95,10 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') + parser.add_argument( + '--disable_tokenizer', + action='store_true', + help='Disable tokenization and detokenization') parser.add_argument( '--revision', type=str, From a0d140535a1178a61c9438450b1e77d513d07c70 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sun, 31 Mar 2024 22:56:26 -0700 Subject: [PATCH 19/38] [tokenizer] fix the order of initializing tokenizer and de-tokenizer --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ef899c4bb2f3f..22284f792cbe1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -113,8 +113,8 @@ def __init__( self.log_stats = log_stats if not self.model_config.disable_tokenizer: - self.detokenizer = Detokenizer(self.tokenizer) self._init_tokenizer() + self.detokenizer = Detokenizer(self.tokenizer) else: self.detokenizer = None self.tokenizer = None From 0af6b477346e1a0d277866770f1bae449eaca4cc Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Mon, 1 Apr 2024 16:50:47 -0700 Subject: [PATCH 20/38] [tokenizer] Never disable tok in LLM initialization --- vllm/entrypoints/llm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c1..bb61919d47b68 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -95,6 +95,7 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, + disable_tokenizer=False, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, From 
a497ed97a7c370d8000467986096553db8e72bb4 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Mon, 1 Apr 2024 21:46:05 -0700 Subject: [PATCH 21/38] [tokenizer] Add flag value to log info to help debug --- vllm/engine/llm_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 22284f792cbe1..19889d27760e4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -83,6 +83,7 @@ def __init__( f"model={model_config.model!r}, " f"speculative_config={speculative_config!r}, " f"tokenizer={model_config.tokenizer!r}, " + f"disable_tokenizer={model_config.disable_tokenizer}, " f"tokenizer_mode={model_config.tokenizer_mode}, " f"revision={model_config.revision}, " f"tokenizer_revision={model_config.tokenizer_revision}, " From af078a8afc3cb80d9dc332f997257ba79269415c Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Tue, 2 Apr 2024 00:37:31 -0700 Subject: [PATCH 22/38] [tokenizer] fix type with a ',', it becomes a tuple, and deemed as True. --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7a40a60754e26..695a28c047285 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None - disable_tokenizer: bool = False, + disable_tokenizer: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None From ad2c920f54ef9a63463ad4976ac4ba3feae88427 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Tue, 2 Apr 2024 00:54:52 -0700 Subject: [PATCH 23/38] [tokenizer] fix yapf errors --- vllm/engine/arg_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 695a28c047285..ce6b1a19a73c7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -95,10 +95,9 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') - parser.add_argument( - '--disable_tokenizer', - action='store_true', - help='Disable tokenization and detokenization') + parser.add_argument('--disable_tokenizer', + action='store_true', + help='Disable tokenization and detokenization') parser.add_argument( '--revision', type=str, From 78d40919d60ed82f82b5a57f50eafe06b7f91bad Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Thu, 4 Apr 2024 10:17:20 -0700 Subject: [PATCH 24/38] [tokenizer] fix formatting --- vllm/engine/arg_utils.py | 3 ++- vllm/engine/llm_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ce6b1a19a73c7..e9a8e3ba63c83 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -426,7 +426,8 @@ def create_engine_config(self, ) -> EngineConfig: self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, - self.max_context_len_to_capture, self.max_logprobs, self.disable_tokenizer) + self.max_context_len_to_capture, self.max_logprobs, + self.disable_tokenizer) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 19889d27760e4..3ce4388fb1585 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -488,8 +488,8 
@@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, for seq, _ in child_seqs: if seq_group.sampling_params.detokenize and self.detokenizer: - self.detokenizer.decode_sequence_inplace(seq, - seq_group.sampling_params) + self.detokenizer.decode_sequence_inplace( + seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From 4f67490aa45a9e88459c160555f7c7180facf44a Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 6 Apr 2024 21:14:19 -0700 Subject: [PATCH 25/38] [tokenizer] fix EngineArgs --- vllm/entrypoints/llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index bb61919d47b68..5777e8179a1c1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -95,7 +95,6 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, - disable_tokenizer=False, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, From 59fc5eb31a5a93282436763013a40bc7827bde2d Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Sat, 6 Apr 2024 22:52:44 -0700 Subject: [PATCH 26/38] [tokenizer] fix init LLM --- vllm/entrypoints/llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index bb61919d47b68..5777e8179a1c1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -95,7 +95,6 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, - disable_tokenizer=False, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, From ac7a3d4a2bb0fe84b2b8b22eaedfac04efaea851 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 15:44:14 -0700 Subject: [PATCH 27/38] [tokenizer] rename the flag --- vllm/config.py | 8 ++++---- vllm/engine/arg_utils.py | 11 ++++++----- vllm/engine/llm_engine.py | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index db3cdfaafc8d9..844e0c5a6e4a3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,7 +71,7 @@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - disable_tokenizer: If true, tokenization and detokenization are + skip_tokenizer_init: If true, tokenization and detokenization are disabled. 
""" @@ -94,7 +94,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, - disable_tokenizer: bool = False, + skip_tokenizer_init: bool = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -111,7 +111,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs - self.disable_tokenizer = disable_tokenizer + self.skip_tokenizer_init = skip_tokenizer_init if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -136,7 +136,7 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - if not self.disable_tokenizer: + if not self.skip_tokenizer_init: self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e9a8e3ba63c83..173db54e46306 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None - disable_tokenizer: bool = False + skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None @@ -95,9 +95,10 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') - parser.add_argument('--disable_tokenizer', - action='store_true', - help='Disable tokenization and detokenization') + parser.add_argument( + '--skip_tokenizer_init', + action='store_true', + help='Skip initialization of tokenizer and detokenizer') parser.add_argument( '--revision', type=str, @@ -427,7 +428,7 @@ def create_engine_config(self, ) -> EngineConfig: self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture, self.max_logprobs, - self.disable_tokenizer) + self.skip_tokenizer_init) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3ce4388fb1585..55ebfa0fda2bf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -83,7 +83,7 @@ def __init__( f"model={model_config.model!r}, " f"speculative_config={speculative_config!r}, " f"tokenizer={model_config.tokenizer!r}, " - f"disable_tokenizer={model_config.disable_tokenizer}, " + f"skip_tokenizer_init={model_config.skip_tokenizer_init}, " f"tokenizer_mode={model_config.tokenizer_mode}, " f"revision={model_config.revision}, " f"tokenizer_revision={model_config.tokenizer_revision}, " @@ -113,7 +113,7 @@ def __init__( self.speculative_config = speculative_config self.log_stats = log_stats - if not self.model_config.disable_tokenizer: + if not self.model_config.skip_tokenizer_init: self._init_tokenizer() self.detokenizer = Detokenizer(self.tokenizer) else: From 400224d26505d326d207e79a3fa7627508a69d62 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 15:44:14 -0700 Subject: [PATCH 28/38] [tokenizer] rename the flag --- vllm/config.py | 10 +++++----- vllm/engine/arg_utils.py | 11 ++++++----- vllm/engine/llm_engine.py | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index db3cdfaafc8d9..fefdfefd362c8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -71,8 +71,8 
@@ class ModelConfig: max_context_len_to_capture: Maximum context len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. - disable_tokenizer: If true, tokenization and detokenization are - disabled. + skip_tokenizer_init: If true, skip initialization of tokenizer and + detokenizer. """ def __init__( @@ -94,7 +94,7 @@ def __init__( enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, - disable_tokenizer: bool = False, + skip_tokenizer_init: bool = False, ) -> None: self.model = model self.tokenizer = tokenizer @@ -111,7 +111,7 @@ def __init__( self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs - self.disable_tokenizer = disable_tokenizer + self.skip_tokenizer_init = skip_tokenizer_init if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, @@ -136,7 +136,7 @@ def __init__( self.max_model_len = _get_and_verify_max_len(self.hf_text_config, max_model_len) self._verify_load_format() - if not self.disable_tokenizer: + if not self.skip_tokenizer_init: self._verify_tokenizer_mode() self._verify_quantization() self._verify_cuda_graph() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e9a8e3ba63c83..173db54e46306 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ class EngineArgs: """Arguments for vLLM engine.""" model: str tokenizer: Optional[str] = None - disable_tokenizer: bool = False + skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False download_dir: Optional[str] = None @@ -95,9 +95,10 @@ def add_cli_args( type=str, default=EngineArgs.tokenizer, help='name or path of the huggingface tokenizer to use') - parser.add_argument('--disable_tokenizer', - action='store_true', - help='Disable tokenization and detokenization') + parser.add_argument( + '--skip_tokenizer_init', + action='store_true', + help='Skip initialization of tokenizer and detokenizer') parser.add_argument( '--revision', type=str, @@ -427,7 +428,7 @@ def create_engine_config(self, ) -> EngineConfig: self.tokenizer_revision, self.max_model_len, self.quantization, self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture, self.max_logprobs, - self.disable_tokenizer) + self.skip_tokenizer_init) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3ce4388fb1585..55ebfa0fda2bf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -83,7 +83,7 @@ def __init__( f"model={model_config.model!r}, " f"speculative_config={speculative_config!r}, " f"tokenizer={model_config.tokenizer!r}, " - f"disable_tokenizer={model_config.disable_tokenizer}, " + f"skip_tokenizer_init={model_config.skip_tokenizer_init}, " f"tokenizer_mode={model_config.tokenizer_mode}, " f"revision={model_config.revision}, " f"tokenizer_revision={model_config.tokenizer_revision}, " @@ -113,7 +113,7 @@ def __init__( self.speculative_config = speculative_config self.log_stats = log_stats - if not self.model_config.disable_tokenizer: + if not self.model_config.skip_tokenizer_init: self._init_tokenizer() self.detokenizer = Detokenizer(self.tokenizer) else: From 69416284d431c86e09c6be38c61061ba5e829ee6 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 17:08:45 -0700 
Subject: [PATCH 29/38] [tokenizer] add integration test --- tests/engine/test_skip_tokenizer_init.py | 23 +++++++++++++++++++++++ vllm/entrypoints/llm.py | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/engine/test_skip_tokenizer_init.py diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py new file mode 100644 index 0000000000000..d82ca30fcb534 --- /dev/null +++ b/tests/engine/test_skip_tokenizer_init.py @@ -0,0 +1,23 @@ +import pytest + +from vllm.entrypoints.llm import LLM +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +def test_skip_tokenizer_init(model: str): + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain token ids. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? Is there an easy to follow video tutorial available " + "online for free?") + + llm = LLM(model=model, skip_tokenizer_init=True) + sampling_params = SamplingParams(max_tokens=10, + temperature=0.0, + detokenize=False) + + with pytest.raises(ValueError) as err: + llm.generate(prompt, sampling_params) + assert "prompts must be None if skip_tokenizer_init is True" in str(err.value) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c1..f4d70db621ce9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -32,6 +32,9 @@ class LLM: tokenizer: The name or path of a HuggingFace Transformers tokenizer. tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer. + skip_tokenizer_init: If true, skip initialization of tokenizer and + detokenizer. Expect valid prompt_token_ids and None for prompt + from the input. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. tensor_parallel_size: The number of GPUs to use for distributed @@ -75,6 +78,7 @@ def __init__( model: str, tokenizer: Optional[str] = None, tokenizer_mode: str = "auto", + skip_tokenizer_init: bool = False, trust_remote_code: bool = False, tensor_parallel_size: int = 1, dtype: str = "auto", @@ -95,6 +99,7 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, + skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, @@ -155,6 +160,8 @@ def generate( if prompts is None and prompt_token_ids is None: raise ValueError("Either prompts or prompt_token_ids must be " "provided.") + if self.llm_engine.model_config.skip_tokenizer_init and prompts is not None: + raise ValueError("prompts must be None if skip_tokenizer_init is True") if isinstance(prompts, str): # Convert a single prompt to a list. 
prompts = [prompts] From ad4da7cfea5d5ceea4ce284add4e42b69f9fa714 Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 17:08:45 -0700 Subject: [PATCH 30/38] [tokenizer] add integration test --- tests/engine/test_skip_tokenizer_init.py | 24 ++++++++++++++++++++++++ vllm/entrypoints/llm.py | 9 +++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/engine/test_skip_tokenizer_init.py diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py new file mode 100644 index 0000000000000..ede73292c0a76 --- /dev/null +++ b/tests/engine/test_skip_tokenizer_init.py @@ -0,0 +1,24 @@ +import pytest + +from vllm.entrypoints.llm import LLM +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +def test_skip_tokenizer_initialization(model: str): + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? Is there an easy to follow video tutorial available " + "online for free?") + + llm = LLM(model=model, skip_tokenizer_init=True) + sampling_params = SamplingParams(max_tokens=10, + temperature=0.0, + detokenize=False) + + with pytest.raises(ValueError) as err: + llm.generate(prompt, sampling_params) + assert "prompts must be None if" in str(err.value) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5777e8179a1c1..dc51de5db8cd3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -32,6 +32,9 @@ class LLM: tokenizer: The name or path of a HuggingFace Transformers tokenizer. tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer. + skip_tokenizer_init: If true, skip initialization of tokenizer and + detokenizer. Expect valid prompt_token_ids and None for prompt + from the input. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. tensor_parallel_size: The number of GPUs to use for distributed @@ -75,6 +78,7 @@ def __init__( model: str, tokenizer: Optional[str] = None, tokenizer_mode: str = "auto", + skip_tokenizer_init: bool = False, trust_remote_code: bool = False, tensor_parallel_size: int = 1, dtype: str = "auto", @@ -95,6 +99,7 @@ def __init__( model=model, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, + skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, dtype=dtype, @@ -155,6 +160,10 @@ def generate( if prompts is None and prompt_token_ids is None: raise ValueError("Either prompts or prompt_token_ids must be " "provided.") + if self.llm_engine.model_config.skip_tokenizer_init \ + and prompts is not None: + raise ValueError("prompts must be None if skip_tokenizer_init " + "is True") if isinstance(prompts, str): # Convert a single prompt to a list. 
prompts = [prompts] From 4b3c5e3c5c511e667cd9921ce6e8786ab66d963c Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 18:11:01 -0700 Subject: [PATCH 31/38] [tokenizer] test generate based on prompt token ids --- tests/engine/test_skip_tokenizer_init.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index ede73292c0a76..56711eb289ffb 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -9,16 +9,14 @@ def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization # of tokenizer and detokenizer. The generated output is expected to contain # token ids. - prompt = ( - "You are a helpful assistant. How do I build a car from cardboard and " - "paper clips? Is there an easy to follow video tutorial available " - "online for free?") - llm = LLM(model=model, skip_tokenizer_init=True) - sampling_params = SamplingParams(max_tokens=10, - temperature=0.0, - detokenize=False) - + sampling_params = SamplingParams() with pytest.raises(ValueError) as err: - llm.generate(prompt, sampling_params) + llm.generate("abc", sampling_params) assert "prompts must be None if" in str(err.value) + outputs = llm.generate(prompt_token_ids=[1, 2, 3], + sampling_params=sampling_params) + assert len(outputs) > 0 + completions = outputs[0].outputs + assert len(completions) > 0 + assert completions[0].text == "" From 87c695b260cb1ecbeede25db49a28ba783129aaf Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Wed, 10 Apr 2024 19:17:25 -0700 Subject: [PATCH 32/38] [tokenizer] more tests --- tests/engine/test_skip_tokenizer_init.py | 5 +++-- vllm/engine/llm_engine.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 56711eb289ffb..baa463a316902 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -10,13 +10,14 @@ def test_skip_tokenizer_initialization(model: str): # of tokenizer and detokenizer. The generated output is expected to contain # token ids. 
llm = LLM(model=model, skip_tokenizer_init=True) - sampling_params = SamplingParams() + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) with pytest.raises(ValueError) as err: llm.generate("abc", sampling_params) assert "prompts must be None if" in str(err.value) - outputs = llm.generate(prompt_token_ids=[1, 2, 3], + outputs = llm.generate(prompt_token_ids=[[1, 2, 3]], sampling_params=sampling_params) assert len(outputs) > 0 completions = outputs[0].outputs assert len(completions) > 0 assert completions[0].text == "" + assert completions[0].token_ids diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 55ebfa0fda2bf..68dfb8ff42691 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -441,7 +441,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Process prompt logprobs prompt_logprobs = outputs.prompt_logprobs - if prompt_logprobs is not None and seq_group.sampling_params.detokenize: + if prompt_logprobs is not None \ + and seq_group.sampling_params.detokenize \ + and self.detokenizer: self.detokenizer.decode_prompt_logprobs_inplace( seq_group, prompt_logprobs) seq_group.prompt_logprobs = prompt_logprobs From aa5ec54947539b0e36aa761b01c024de556f64cd Mon Sep 17 00:00:00 2001 From: Yun Ding Date: Fri, 12 Apr 2024 16:57:40 -0700 Subject: [PATCH 33/38] [tokenizer] consider finialize sequence --- vllm/engine/llm_engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 68dfb8ff42691..6b466a911edf6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -811,11 +811,12 @@ def _check_stop(self, seq: Sequence, return last_token_id = seq.get_last_token_id() if last_token_id in sampling_params.stop_token_ids: - stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens( - last_token_id) - self._finalize_sequence(seq, sampling_params, stop_str) seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = last_token_id + if sampling_params.detokenize: + stop_str = self.get_tokenizer_for_seq(seq)\ + .convert_ids_to_tokens(last_token_id) + self._finalize_sequence(seq, sampling_params, stop_str) return # Check if the sequence has generated the EOS token. 
From 68f77b170f95803516ec77a3c48ebd04839b2e21 Mon Sep 17 00:00:00 2001
From: Yun Ding
Date: Tue, 16 Apr 2024 17:27:13 -0700
Subject: [PATCH 34/38] Merge branch 'main' into optional-tokenizer

---
 vllm/engine/llm_engine.py | 178 --------------------------------------
 1 file changed, 178 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c7b4c64777ece..fa12ada7d63ae 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -439,184 +439,6 @@ def has_unfinished_requests(self) -> bool:
         """Returns True if there are unfinished requests."""
         return self.scheduler.has_unfinished_seqs()
 
-    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
-                                        outputs: SequenceGroupOutput) -> None:
-
-        # Process prompt logprobs
-        prompt_logprobs = outputs.prompt_logprobs
-        if prompt_logprobs is not None \
-            and seq_group.sampling_params.detokenize \
-            and self.detokenizer:
-            self.detokenizer.decode_prompt_logprobs_inplace(
-                seq_group, prompt_logprobs)
-            seq_group.prompt_logprobs = prompt_logprobs
-
-        # Process samples
-        samples = outputs.samples
-        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-        existing_finished_seqs = seq_group.get_finished_seqs()
-        parent_child_dict = {
-            parent_seq.seq_id: []
-            for parent_seq in parent_seqs
-        }
-        for sample in samples:
-            parent_child_dict[sample.parent_seq_id].append(sample)
-        # List of (child, parent)
-        child_seqs: List[Tuple[Sequence, Sequence]] = []
-
-        # Process the child samples for each parent sequence
-        for parent in parent_seqs:
-            child_samples: List[SequenceOutput] = parent_child_dict[
-                parent.seq_id]
-            if len(child_samples) == 0:
-                # This parent sequence has no children samples. Remove
-                # the parent sequence from the sequence group since it will
-                # not be used in the future iterations.
-                parent.status = SequenceStatus.FINISHED_ABORTED
-                seq_group.remove(parent.seq_id)
-                self.scheduler.free_seq(parent)
-                continue
-            # Fork the parent sequence if there are multiple child samples.
-            for child_sample in child_samples[:-1]:
-                new_child_seq_id = next(self.seq_counter)
-                child = parent.fork(new_child_seq_id)
-                child.append_token_id(child_sample.output_token,
-                                      child_sample.logprobs)
-                child_seqs.append((child, parent))
-            # Continue the parent sequence for the last child sample.
-            # We reuse the parent sequence here to reduce redundant memory
-            # copies, especially when using non-beam search sampling methods.
-            last_child_sample = child_samples[-1]
-            parent.append_token_id(last_child_sample.output_token,
-                                   last_child_sample.logprobs)
-            child_seqs.append((parent, parent))
-
-        for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize and self.detokenizer:
-                new_char_count = self.detokenizer.decode_sequence_inplace(
-                    seq, seq_group.sampling_params)
-            else:
-                new_char_count = 0
-            self._check_stop(seq, new_char_count, seq_group.sampling_params)
-
-        # Non-beam search case
-        if not seq_group.sampling_params.use_beam_search:
-            # For newly created child sequences, add them to the sequence group
-            # and fork them in block manager if they are not finished.
-            for seq, parent in child_seqs:
-                if seq is not parent:
-                    seq_group.add(seq)
-                    if not seq.is_finished():
-                        self.scheduler.fork_seq(parent, seq)
-
-            # Free the finished and selected parent sequences' memory in block
-            # manager. Keep them in the sequence group as candidate output.
-            # NOTE: we need to fork the new sequences before freeing the
-            # old sequences.
-            for seq, parent in child_seqs:
-                if seq is parent and seq.is_finished():
-                    self.scheduler.free_seq(seq)
-            return
-
-        # Beam search case
-        # Select the child sequences to keep in the sequence group.
-        selected_child_seqs = []
-        unselected_child_seqs = []
-        beam_width = seq_group.sampling_params.best_of
-        length_penalty = seq_group.sampling_params.length_penalty
-
-        # Select the newly finished sequences with the highest scores
-        # to replace existing finished sequences.
-        # Tuple of (seq, parent, is_new)
-        existing_finished_seqs = [(seq, None, False)
-                                  for seq in existing_finished_seqs]
-        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
-                             if seq.is_finished()]
-        all_finished_seqs = existing_finished_seqs + new_finished_seqs
-        # Sort the finished sequences by their scores.
-        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
-                               reverse=True)
-        for seq, parent, is_new in all_finished_seqs[:beam_width]:
-            if is_new:
-                # A newly generated child sequence finishes and has a high
-                # score, so we will add it into the sequence group.
-                selected_child_seqs.append((seq, parent))
-        for seq, parent, is_new in all_finished_seqs[beam_width:]:
-            if is_new:
-                # A newly generated child sequence finishes but has a low
-                # score, so we will not add it into the sequence group.
-                # Additionally, if this sequence is a continuation of a
-                # parent sequence, we will need remove the parent sequence
-                # from the sequence group.
-                unselected_child_seqs.append((seq, parent))
-            else:
-                # An existing finished sequence has a low score, so we will
-                # remove it from the sequence group.
-                seq_group.remove(seq.seq_id)
-
-        # select the top beam_width sequences from the running
-        # sequences for the next iteration to continue the beam
-        # search.
-        running_child_seqs = [(seq, parent) for seq, parent in child_seqs
-                              if not seq.is_finished()]
-        # Sort the running sequences by their scores.
-        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
-            length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
-                                reverse=True)
-
-        # Check if we can stop the beam search.
-        if len(running_child_seqs) == 0:
-            # No running sequences, stop the beam search.
-            stop_beam_search = True
-        elif len(all_finished_seqs) < beam_width:
-            # Not enough finished sequences, continue the beam search.
-            stop_beam_search = False
-        else:
-            # Check the early stopping criteria
-            best_running_seq = running_child_seqs[0][0]
-            current_worst_seq = all_finished_seqs[beam_width - 1][0]
-            stop_beam_search = self._check_beam_search_early_stopping(
-                seq_group.sampling_params.early_stopping,
-                seq_group.sampling_params, best_running_seq, current_worst_seq)
-
-        if stop_beam_search:
-            # Stop the beam search and remove all the running sequences from
-            # the sequence group.
-            unselected_child_seqs.extend(running_child_seqs)
-        else:
-            # Continue the beam search and select the top beam_width sequences
-            # to continue the beam search.
-            selected_child_seqs.extend(running_child_seqs[:beam_width])
-            # The remaining running sequences will not be used in the next
-            # iteration. Again, if these sequences are continuations of
-            # parent sequences, we will need to remove the parent sequences
-            # from the sequence group.
-            unselected_child_seqs.extend(running_child_seqs[beam_width:])
-
-        # For newly created child sequences, add them to the sequence group
-        # and fork them in block manager if they are not finished.
-        for seq, parent in selected_child_seqs:
-            if seq is not parent:
-                seq_group.add(seq)
-                if not seq.is_finished():
-                    self.scheduler.fork_seq(parent, seq)
-
-        # Free the finished and selected parent sequences' memory in block
-        # manager. Keep them in the sequence group as candidate output.
-        for seq, parent in selected_child_seqs:
-            if seq is parent and seq.is_finished():
-                self.scheduler.free_seq(seq)
-
-        # Remove the unselected parent sequences from the sequence group and
-        # free their memory in block manager.
-        for seq, parent in unselected_child_seqs:
-            if seq is parent:
-                # Remove the parent sequence if it is not selected for next
-                # iteration
-                seq_group.remove(seq.seq_id)
-                self.scheduler.free_seq(seq)
-
     def _process_model_outputs(
             self, output: List[SamplerOutput],
             scheduled_seq_groups: List[SequenceGroup],

From c0951f35dbe58048817eb8af3b4efd1e1878778d Mon Sep 17 00:00:00 2001
From: Yun Ding
Date: Tue, 16 Apr 2024 20:51:06 -0700
Subject: [PATCH 35/38] [tokenizer] fix integration test

---
 vllm/engine/output_processor/single_step.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index 1b7eb014f802b..a7b8e2a70ade7 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -59,7 +59,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
 
         # Process prompt logprobs
         prompt_logprobs = outputs.prompt_logprobs
-        if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
+        if prompt_logprobs is not None and \
+                seq_group.sampling_params.detokenize and self.detokenizer:
             self.detokenizer.decode_prompt_logprobs_inplace(
                 seq_group, prompt_logprobs)
             seq_group.prompt_logprobs = prompt_logprobs

From 47dce6efb8583de91646104f5149526bd6d3221b Mon Sep 17 00:00:00 2001
From: Yun Ding
Date: Wed, 17 Apr 2024 09:49:38 -0700
Subject: [PATCH 36/38] [tokenizer] merge with main

---
 vllm/engine/output_processor/single_step.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index a7b8e2a70ade7..b32937327ba7f 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -106,7 +106,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
             child_seqs.append((parent, parent))
 
         for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize:
+            if seq_group.sampling_params.detokenize and self.detokenizer:
                 new_char_count = self.detokenizer.decode_sequence_inplace(
                     seq, seq_group.sampling_params)
             else:

From 50a7fad76fa9909ba8e05b7dfb27b2313360e56e Mon Sep 17 00:00:00 2001
From: Yun Ding
Date: Thu, 18 Apr 2024 10:53:08 -0700
Subject: [PATCH 37/38] [tokenizer] log warning if eos_token_id is None

---
 vllm/engine/arg_utils.py  | 2 +-
 vllm/engine/llm_engine.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e0142202fff7e..7ff5cd079b447 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -97,7 +97,7 @@ def add_cli_args(
             default=EngineArgs.tokenizer,
             help='name or path of the huggingface tokenizer to use')
         parser.add_argument(
-            '--skip_tokenizer_init',
+            '--skip-tokenizer-init',
             action='store_true',
             help='Skip initialization of tokenizer and detokenizer')
         parser.add_argument(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index fa12ada7d63ae..83d6c7bcee1dd 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -391,6 +391,9 @@ def add_request(
         if self.tokenizer:
             eos_token_id = self.tokenizer.get_lora_tokenizer(
                 lora_request).eos_token_id
+        else:
+            logger.warning("Use None for EOS token id because tokenizer is "
+                           "not initialized")
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                        eos_token_id, lora_request)
 

From 5f2b8ed1f8cd59ff661fa30b32e4ab97e2398aa2 Mon Sep 17 00:00:00 2001
From: Yun Ding
Date: Thu, 18 Apr 2024 13:30:46 -0700
Subject: [PATCH 38/38] [tokenizer] work around mypy errors

---
 vllm/engine/llm_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 83d6c7bcee1dd..ce6bea269dee2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -123,6 +123,7 @@ def __init__(
         self.log_stats = log_stats
 
         if not self.model_config.skip_tokenizer_init:
+            self.tokenizer: BaseTokenizerGroup
             self._init_tokenizer()
             self.detokenizer = Detokenizer(self.tokenizer)
         else:
@@ -290,7 +291,7 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
             trust_remote_code=self.model_config.trust_remote_code,
             revision=self.model_config.tokenizer_revision)
         init_kwargs.update(tokenizer_init_kwargs)
-        self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
+        self.tokenizer = get_tokenizer_group(
             self.parallel_config.tokenizer_pool_config, **init_kwargs)
 
     def _verify_args(self) -> None:
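
A practical note on the eos_token_id warning added in PATCH 37: with the tokenizer skipped, add_request stores None as the EOS id, so the EOS stop check can never fire and generation ends only on max_tokens or on stop_token_ids supplied by the caller (which PATCH 33 made safe to evaluate without a detokenizer). The sketch below shows one way a caller might compensate; it is not part of the series, and the model name and the stop id 2 are illustrative assumptions (2 happens to be the EOS id of OPT-style models, but the correct value is model-specific).

    # Sketch only: bound generation explicitly, since the engine cannot see EOS
    # on its own when skip_tokenizer_init=True.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)  # illustrative
    params = SamplingParams(
        max_tokens=64,         # hard cap on generated tokens
        stop_token_ids=[2],    # caller-supplied EOS/stop ids, checked by _check_stop
        detokenize=False,
    )
    outputs = llm.generate(prompt_token_ids=[[1, 10, 42]], sampling_params=params)
    print(outputs[0].outputs[0].token_ids)

The same consideration applies when the engine is driven from the command line, where PATCH 37 also renames the flag to the dashed form --skip-tokenizer-init.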