[Frontend] API support for beam search for MQLLMEngine #9117

Merged (13 commits) on Oct 8, 2024
14 changes: 6 additions & 8 deletions vllm/engine/async_llm_engine.py

@@ -14,7 +14,6 @@
from vllm.engine.async_timeout import asyncio_timeout
from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState
from vllm.engine.metrics_types import StatLoggerBase
from vllm.entrypoints.llm import BeamSearchSequence
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutorAsync
from vllm.executor.ray_utils import initialize_ray_cluster
@@ -32,8 +31,9 @@
from vllm.sequence import ExecuteModelRequest
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (collect_from_async_generator, deprecate_kwargs,
get_beam_search_score, random_uuid, weak_bind)
from vllm.utils import (BeamSearchSequence, collect_from_async_generator,
create_sort_beams_key_function, deprecate_kwargs,
random_uuid, weak_bind)

logger = init_logger(__name__)
ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -1052,16 +1052,14 @@ async def beam_search(
temperature = params.temperature
length_penalty = params.length_penalty

def sort_beams_key(x: BeamSearchSequence) -> float:
return get_beam_search_score(x.tokens, x.cum_logprob,
tokenizer.eos_token_id,
length_penalty)

tokenizer = await self.get_tokenizer()
tokenizedPrompt = prompt if isinstance(
prompt, list) else tokenizer.encode(prompt)
tokenizedLength = len(tokenizedPrompt)

sort_beams_key = create_sort_beams_key_function(
tokenizer, length_penalty=length_penalty)

beam_search_params = SamplingParams(logprobs=2 * beam_width,
max_tokens=1,
temperature=temperature)
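For context on how this helper is consumed, here is a minimal, hedged sketch (assuming vllm.utils at this PR's state; pick_top_beams and all_beams are illustrative names, not code from the PR):

from typing import List

from vllm.utils import BeamSearchSequence, create_sort_beams_key_function


def pick_top_beams(all_beams: List[BeamSearchSequence], tokenizer,
                   length_penalty: float,
                   beam_width: int) -> List[BeamSearchSequence]:
    # Build the scoring key once, then keep the best `beam_width` beams
    # (higher length-penalized score is better).
    sort_beams_key = create_sort_beams_key_function(
        tokenizer, length_penalty=length_penalty)
    return sorted(all_beams, key=sort_beams_key, reverse=True)[:beam_width]
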
17 changes: 7 additions & 10 deletions vllm/engine/multiprocessing/client.py

@@ -26,7 +26,6 @@
RPCStartupRequest, RPCStartupResponse,
RPCUProfileRequest)
# yapf: enable
from vllm.entrypoints.llm import BeamSearchSequence
from vllm.envs import VLLM_RPC_TIMEOUT
from vllm.inputs import PromptType, TokensPrompt
from vllm.logger import init_logger
@@ -36,8 +35,9 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import (collect_from_async_generator, deprecate_kwargs,
get_beam_search_score, random_uuid)
from vllm.utils import (BeamSearchSequence, collect_from_async_generator,
create_sort_beams_key_function, deprecate_kwargs,
random_uuid)

logger = init_logger(__name__)

@@ -449,7 +449,6 @@ async def beam_search(
prompt: Union[PromptType, List[int]],
request_id: str,
params: BeamSearchParams,
lora_request: Optional[LoRARequest] = None
) -> AsyncGenerator[RequestOutput, None]:

beam_width = params.beam_width
@@ -458,16 +457,14 @@
temperature = params.temperature
length_penalty = params.length_penalty

def sort_beams_key(x: BeamSearchSequence) -> float:
return get_beam_search_score(x.tokens, x.cum_logprob,
tokenizer.eos_token_id,
length_penalty)

tokenizer = await self.get_tokenizer(lora_request)
tokenizer = await self.get_tokenizer(None)

Collaborator: I would add some sort of kwarg to show why it is None here.

tokenizedPrompt = prompt if isinstance(
prompt, list) else tokenizer.encode(prompt)
tokenizedLength = len(tokenizedPrompt)

sort_beams_key = create_sort_beams_key_function(
tokenizer, length_penalty=length_penalty)

beam_search_params = SamplingParams(logprobs=2 * beam_width,
max_tokens=1,
temperature=temperature)
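A minimal sketch of the reviewer's suggestion above, assuming the parameter is named lora_request (the old code passed a lora_request variable positionally); illustrative only, not part of the PR:

# Explicit keyword to document why None is passed: beam search via this
# frontend currently runs without a LoRA request here (assumption).
tokenizer = await self.get_tokenizer(lora_request=None)
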
37 changes: 3 additions & 34 deletions vllm/entrypoints/llm.py

@@ -1,7 +1,6 @@
import itertools
import warnings
from contextlib import contextmanager
from dataclasses import dataclass
from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple,
Union, cast, overload)

@@ -28,43 +27,13 @@
get_cached_tokenizer)
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (Counter, deprecate_kwargs, get_beam_search_score,
is_list_of)
from vllm.utils import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence, Counter, deprecate_kwargs,
get_beam_search_score, is_list_of)

logger = init_logger(__name__)


@dataclass
class BeamSearchSequence:
"""A sequence for beam search.
It keeps track of the tokens and the log probability of the sequence.
The text field is optional and will only be filled when the sequence is
about to be returned to the user.
"""
# The tokens includes the prompt.
tokens: List[int]
cum_logprob: float = 0.0
text: Optional[str] = None


@dataclass
class BeamSearchOutput:
"""The output of beam search.
It contains the list of the best beam search sequences.
The length of the list is equal to the beam width.
"""
sequences: List[BeamSearchSequence]


class BeamSearchInstance:

def __init__(self, prompt_tokens: List[int]):
self.beams: List[BeamSearchSequence] = [
BeamSearchSequence(tokens=prompt_tokens)
]
self.completed: List[BeamSearchSequence] = []


class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.

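For readers following the move, here is a simplified, self-contained sketch of how these relocated data structures cooperate during one beam-search expansion step (the token proposals are stubbed; the real engine derives them from model logprobs and ranks beams with create_sort_beams_key_function):

from typing import List, Tuple

from vllm.utils import BeamSearchInstance, BeamSearchSequence


def expand_beams(instance: BeamSearchInstance,
                 proposals: List[List[Tuple[int, float]]], beam_width: int,
                 eos_token_id: int) -> None:
    # proposals[i] holds (token_id, logprob) candidates for instance.beams[i].
    new_beams: List[BeamSearchSequence] = []
    for beam, candidates in zip(instance.beams, proposals):
        for token_id, logprob in candidates:
            seq = BeamSearchSequence(tokens=beam.tokens + [token_id],
                                     cum_logprob=beam.cum_logprob + logprob)
            if token_id == eos_token_id:
                instance.completed.append(seq)
            else:
                new_beams.append(seq)
    # Keep only the best `beam_width` live beams for the next step.
    new_beams.sort(key=lambda s: s.cum_logprob, reverse=True)
    instance.beams = new_beams[:beam_width]

Once the token budget is exhausted, the surviving and completed sequences are ranked and wrapped into a BeamSearchOutput (one entry per beam).
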
29 changes: 10 additions & 19 deletions vllm/entrypoints/openai/serving_chat.py

@@ -237,25 +237,16 @@ async def create_chat_completion(
log_tracing_disabled_warning()

if isinstance(sampling_params, BeamSearchParams):
if isinstance(self.engine_client, AsyncLLMEngine):
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
request_id,
sampling_params,
)
elif isinstance(self.engine_client, MQLLMEngineClient):
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
request_id,
sampling_params,
lora_request,
)
else:
raise ValueError(
"Beam search in the API server is only supported with"
" AsyncLLMEngine and MQLLMEngineClient. please add "
"`--disable-frontend-multiprocessing` to "
"use beam search.")
assert isinstance(self.engine_client,
(AsyncLLMEngine,
MQLLMEngineClient)), \
"Beam search is only supported with" \
"AsyncLLMEngine and MQLLMEngineClient."
result_generator = self.engine_client.beam_search(
engine_inputs['prompt_token_ids'],
request_id,
sampling_params,
)
else:
result_generator = self.engine_client.generate(
engine_inputs,
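End to end, the user-facing effect is that beam search now works through the default multiprocessing frontend, so the `--disable-frontend-multiprocessing` workaround mentioned in the removed error message is no longer needed. A hedged usage sketch against the OpenAI-compatible server follows (the model name is a placeholder; the exact extra_body keys and the beam-width mapping depend on the vLLM version):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Assumption: with use_beam_search enabled, the server builds BeamSearchParams
# and (presumably) uses `n` as the beam width.
completion = client.completions.create(
    model="my-served-model",  # placeholder for whatever model the server hosts
    prompt="San Francisco is a",
    max_tokens=32,
    n=4,
    extra_body={"use_beam_search": True},
)
print(completion.choices[0].text)
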
24 changes: 10 additions & 14 deletions vllm/entrypoints/openai/serving_completion.py

@@ -151,20 +151,16 @@ async def create_completion(
log_tracing_disabled_warning()

if isinstance(sampling_params, BeamSearchParams):
if isinstance(self.engine_client, AsyncLLMEngine):
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params)
elif isinstance(self.engine_client, MQLLMEngineClient):
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"], request_id_item,
sampling_params, lora_request)
else:
raise ValueError(
"Beam search in the API server is only supported"
" with AsyncLLMEngine and MQLLMEngineClient."
" please add `--disable-frontend-multiprocessing`"
" to use beam search.")
assert isinstance(self.engine_client,
(AsyncLLMEngine,

Collaborator: On second thought, you don't actually need the assert. You can add beam_search to the EngineClientProtocol.

Contributor Author (@LunrEclipse), Oct 7, 2024: I don't think the base EngineClientProtocol has a beam_search function. I can add it in, though.

Collaborator: It does not. But the EngineClientProtocol defines the behavior of AsyncLLMEngine and MQLLMEngine. So now that both support it, you can expand EngineClientProtocol to include the beam_search API.

Member: I cannot find EngineClientProtocol. Does it exist now? @robertgshaw2-neuralmagic

Contributor Author: I think it's this?

Member: Okay, it should be the EngineClient class. But MQLLMEngineClient does not inherit from EngineClient. We can make it a future step to absorb the beam search implementation into EngineClient.

Collaborator: EngineClient is a protocol; MQLLMEngine should inherit from it. If it doesn't, I'll submit a PR to make it so (since we support the full API). On a train, so AFK. Either way, we are about to collapse MQLLMEngine and AsyncLLMEngine once we have PP working, so the concept of an EngineClient will be removed once that is done.

Member: Sounds good. I'll go ahead and merge this PR after it is ready, and after you make MQLLMEngine inherit from EngineClient, we can merge the separate beam search implementations in one place.

Member: Given that it's currently a Protocol, MQLLMEngine technically doesn't need to subclass it directly, but it would probably be good to anyhow, and we could actually consider changing it to an ABC instead. I agree with @robertgshaw2-neuralmagic that this method should just be added to EngineClient and we should not need these type assertions. Not directly related to this PR, but I also think we should consider renaming it to something like AsyncEngineClient, and having a way to obtain an AsyncEngineClient instance that doesn't involve explicit construction; that would replace explicit use of AsyncLLMEngine.

Member: Changes are welcome on this part!

MQLLMEngineClient)), \
"Beam search is only supported with" \
"AsyncLLMEngine and MQLLMEngineClient."
generator = self.engine_client.beam_search(
prompt_inputs["prompt_token_ids"],
request_id,

Member: What's the difference between request_id and request_id_item?

Contributor Author: Ahh, yeah, it's supposed to be request_id_item; I was a bit careless while doing some refactoring. Completions support multiple prompts, so each prompt has its own request_id_item under a general request_id.

sampling_params,
)
else:
generator = self.engine_client.generate(
{
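To make the thread above concrete, here is a hedged sketch of what adding beam_search to an engine-client Protocol could look like, so the serving layers would not need isinstance assertions (the actual protocol name, location, and signature in vLLM may differ):

from typing import AsyncGenerator, List, Protocol, Union

from vllm.inputs import PromptType
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams


class EngineClient(Protocol):
    """Sketch of the client protocol discussed above (assumed shape)."""

    def beam_search(
        self,
        prompt: Union[PromptType, List[int]],
        request_id: str,
        params: BeamSearchParams,
    ) -> AsyncGenerator[RequestOutput, None]:
        ...

Both AsyncLLMEngine and MQLLMEngineClient already expose essentially this signature in the PR, so declaring it on the protocol would let serving_chat.py and serving_completion.py call self.engine_client.beam_search(...) unconditionally.
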
41 changes: 41 additions & 0 deletions vllm/utils.py

@@ -17,6 +17,7 @@
import warnings
import weakref
from asyncio import FIRST_COMPLETED, ensure_future
from dataclasses import dataclass
from functools import lru_cache, partial, wraps
from platform import uname
from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic,
@@ -1363,6 +1364,37 @@ def value(self):
return self._value


@dataclass
class BeamSearchSequence:
"""A sequence for beam search.
It keeps track of the tokens and the log probability of the sequence.
The text field is optional and will only be filled when the sequence is
about to be returned to the user.
"""
# The tokens includes the prompt.
tokens: List[int]
cum_logprob: float = 0.0
text: Optional[str] = None


@dataclass
class BeamSearchOutput:
"""The output of beam search.
It contains the list of the best beam search sequences.
The length of the list is equal to the beam width.
"""
sequences: List[BeamSearchSequence]


class BeamSearchInstance:

def __init__(self, prompt_tokens: List[int]):
self.beams: List[BeamSearchSequence] = [
BeamSearchSequence(tokens=prompt_tokens)
]
self.completed: List[BeamSearchSequence] = []



Member: Move them to vllm/sequence.py?

Contributor Author: There's a circular import error if I put these classes in vllm/sequence.py, as BeamSearchSequence is needed in vllm/utils.py, but vllm/sequence.py indirectly imports from vllm/utils.py.

Member: Logically, vllm/utils.py should not import vllm/sequence.py. We should change the code if this is the case.

Contributor Author (@LunrEclipse), Oct 7, 2024: Should I also move create_sort_beams_key_function to vllm/sequence.py? That would solve the issue.

Member: Sure, go ahead!

Member: I'm not sure whether these should go in sequence.py, since that holds "internal" data structures used within the scheduler etc., and, if I understand correctly, BeamSearchSequence etc. are only used in the outer layer(s). Maybe better to have a dedicated file at the appropriate place in the tree for this?

Member: > Maybe better to have a dedicated file at the appropriate place in the tree for this?
Makes sense. How about vllm/beam_search.py?

def get_beam_search_score(
tokens: List[int],
cumulative_logprob: float,
@@ -1380,3 +1412,12 @@ def get_beam_search_score(
seq_len -= 1

return cumulative_logprob / (seq_len**length_penalty)


def create_sort_beams_key_function(tokenizer, length_penalty):

Member, suggested change:
  - def create_sort_beams_key_function(tokenizer, length_penalty):
  + def create_sort_beams_key_function(eos_token_id: int, length_penalty: float):

def sort_beams_key(x: BeamSearchSequence) -> float:
return get_beam_search_score(x.tokens, x.cum_logprob,
tokenizer.eos_token_id, length_penalty)

return sort_beams_key
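
Putting the two reviewer suggestions together (a dedicated module and an eos_token_id-based key factory), here is a purely hypothetical sketch of what a future vllm/beam_search.py could contain; this is not part of the PR:

from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class BeamSearchSequence:
    """A beam search sequence; tokens include the prompt."""
    tokens: List[int]
    cum_logprob: float = 0.0
    text: Optional[str] = None


@dataclass
class BeamSearchOutput:
    """The best sequences found, one list entry per beam."""
    sequences: List[BeamSearchSequence]


class BeamSearchInstance:

    def __init__(self, prompt_tokens: List[int]):
        self.beams: List[BeamSearchSequence] = [
            BeamSearchSequence(tokens=prompt_tokens)
        ]
        self.completed: List[BeamSearchSequence] = []


def get_beam_search_score(tokens: List[int], cumulative_logprob: float,
                          eos_token_id: int,
                          length_penalty: float = 1.0) -> float:
    """Length-penalized score; a trailing EOS does not count toward length."""
    seq_len = len(tokens)
    if tokens[-1] == eos_token_id:
        seq_len -= 1
    return cumulative_logprob / (seq_len**length_penalty)


def create_sort_beams_key_function(
        eos_token_id: int,
        length_penalty: float) -> Callable[[BeamSearchSequence], float]:
    # Per the suggested change above: take eos_token_id directly, so callers
    # do not need to hand over a tokenizer object.

    def sort_beams_key(x: BeamSearchSequence) -> float:
        return get_beam_search_score(x.tokens, x.cum_logprob, eos_token_id,
                                     length_penalty)

    return sort_beams_key

Callers would then build the key as create_sort_beams_key_function(tokenizer.eos_token_id, length_penalty), and vllm/utils.py would no longer need to know about beam search at all.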