
Commit

Address some review comments
njhill committed Mar 19, 2024
1 parent ebf6967 commit 8810908
Showing 2 changed files with 9 additions and 2 deletions.
vllm/config.py: 6 changes (5 additions & 1 deletion)
@@ -18,6 +18,10 @@
 
 _GB = 1 << 30
 
+# A cap on the async tokenizer worker pool size when it is computed
+# from the number of available CPU cores
+MAX_TOKENIZER_WORKERS = 16
+
 
 class ModelConfig:
     """Configuration for the model.
@@ -437,7 +441,7 @@ def create_config(
         if tokenizer_pool_size is None:
             # Default based on CPU count
             tokenizer_pool_size = min(
-                16,
+                MAX_TOKENIZER_WORKERS,
                 os.cpu_count() - tensor_parallel_size - 1)
             tokenizer_pool_size = max(1, tokenizer_pool_size)
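For context on the hunk above, here is a minimal standalone sketch of the sizing heuristic. The helper name is hypothetical; the constant and formula mirror the diff:

```python
import os

# Constant mirroring the diff above; caps the computed pool size.
MAX_TOKENIZER_WORKERS = 16


def default_tokenizer_pool_size(tensor_parallel_size: int) -> int:
    # Hypothetical standalone version of the heuristic in create_config:
    # reserve one core per tensor-parallel rank plus one for the main
    # process, cap the remainder at MAX_TOKENIZER_WORKERS, floor at 1.
    cpus = os.cpu_count() or 1  # the None-guard is an addition, not in the diff
    return max(1, min(MAX_TOKENIZER_WORKERS,
                      cpus - tensor_parallel_size - 1))


# e.g. on a 32-core host with tensor_parallel_size=4:
# min(16, 32 - 4 - 1) = 16, so the pool gets 16 workers
```

The floor of 1 matters on small hosts: with 8 cores and tensor_parallel_size=8, the raw formula would go negative, but the pool still gets one worker.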

@@ -27,11 +27,14 @@ def init_tokenizer():
             initializer=init_tokenizer,
         )
 
-        self.encode_async = make_async(self._encode_local, self.executor)
+        self._encode_async = make_async(self._encode_local, self.executor)
 
     def _encode_local(self, *args, **kwargs):
         return self.local.tokenizer.encode(*args, **kwargs)
 
     def encode(self, *args, **kwargs):
         return self.executor.submit(self._encode_local, *args,
                                     **kwargs).result()
 
+    async def encode_async(self, *args, **kwargs):
+        return await self._encode_async(*args, **kwargs)
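Outside the diff, the sync/async split may be easier to read as a whole. Below is a self-contained sketch; the `make_async` helper body is an assumption written as a generic run_in_executor wrapper, and the class name and `ThreadPoolExecutor` choice are illustrative, not the actual vLLM implementation:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial


def make_async(func, executor=None):
    # Assumed shape of the make_async helper: awaiting the wrapper runs
    # the blocking callable on the executor, keeping the event loop free.
    async def _wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            executor, partial(func, *args, **kwargs))
    return _wrapper


class AsyncTokenizer:
    """Illustrative stand-in for the pooled tokenizer in the diff."""

    def __init__(self, tokenizer, max_workers=4):
        self.tokenizer = tokenizer
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        # Awaitable wrapper; the commit renames it with an underscore so
        # it no longer occupies the name of the public method below.
        self._encode_async = make_async(self._encode_local, self.executor)

    def _encode_local(self, *args, **kwargs):
        return self.tokenizer.encode(*args, **kwargs)

    def encode(self, *args, **kwargs):
        # Synchronous path: submit to the pool and block on the result,
        # usable from code that has no running event loop.
        return self.executor.submit(self._encode_local, *args,
                                    **kwargs).result()

    async def encode_async(self, *args, **kwargs):
        # Asynchronous path: delegate to the awaitable wrapper.
        return await self._encode_async(*args, **kwargs)
```

From synchronous code, `pool.encode("hello")` blocks until the worker returns; inside a coroutine, `await pool.encode_async("hello")` yields to the event loop while the worker runs. Renaming the attribute to `_encode_async` frees the `encode_async` name for the explicit async method.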

0 comments on commit 8810908
