
[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend #7394

Merged — 88 commits, merged Aug 21, 2024

Changes from 6 commits (88 commits total)
b2e29a5
added proxy to limit use of uniz sockets
robertgshaw2-redhat Aug 10, 2024
8d31115
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 10, 2024
6d2b3df
comment
robertgshaw2-redhat Aug 10, 2024
c73e943
use random inproc path
robertgshaw2-redhat Aug 10, 2024
f1768fb
format
robertgshaw2-redhat Aug 10, 2024
601a461
foamt
robertgshaw2-redhat Aug 10, 2024
1a47d94
format
robertgshaw2-redhat Aug 10, 2024
eeecb09
Update vllm/entrypoints/openai/rpc/client.py
robertgshaw2-redhat Aug 10, 2024
2770e40
cleaning
robertgshaw2-redhat Aug 14, 2024
5a85618
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 18, 2024
938db1d
Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/…
robertgshaw2-redhat Aug 18, 2024
ea2f03e
remove logging
robertgshaw2-redhat Aug 18, 2024
5cebc65
add info message re: concurrency
robertgshaw2-redhat Aug 18, 2024
2c12436
update comment
robertgshaw2-redhat Aug 18, 2024
9afd6ba
update
robertgshaw2-redhat Aug 18, 2024
c262088
format
robertgshaw2-redhat Aug 18, 2024
3e580d5
reorder
robertgshaw2-redhat Aug 18, 2024
d9e10e0
reverT
robertgshaw2-redhat Aug 18, 2024
4e3a63a
fix
robertgshaw2-redhat Aug 18, 2024
e54bf8a
fix
robertgshaw2-redhat Aug 18, 2024
6544f3a
fix abort logic
robertgshaw2-redhat Aug 18, 2024
81f4da8
reduce LOC change
robertgshaw2-redhat Aug 18, 2024
b3374bc
cleanup
robertgshaw2-redhat Aug 18, 2024
dd1817a
cleanup
robertgshaw2-redhat Aug 18, 2024
5b56365
format
robertgshaw2-redhat Aug 18, 2024
05ff816
fix client
robertgshaw2-redhat Aug 18, 2024
e551d30
revert unneccessary change
robertgshaw2-redhat Aug 18, 2024
3d7f65f
revert startup probe changes to separate PR
robertgshaw2-redhat Aug 18, 2024
e7e6f1e
stash
robertgshaw2-redhat Aug 18, 2024
eaaebcc
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 18, 2024
21b5239
stash draining
robertgshaw2-redhat Aug 19, 2024
7e15b00
update
robertgshaw2-redhat Aug 19, 2024
74c4166
stash
robertgshaw2-redhat Aug 19, 2024
450e949
convert RPCServer to use DEALER
robertgshaw2-redhat Aug 19, 2024
8348f1f
stash
robertgshaw2-redhat Aug 19, 2024
545956e
fix
robertgshaw2-redhat Aug 19, 2024
7a34611
cleaning
robertgshaw2-redhat Aug 19, 2024
50abb94
stash
robertgshaw2-redhat Aug 19, 2024
1723687
remove awk
robertgshaw2-redhat Aug 19, 2024
3dfc9ef
nits
robertgshaw2-redhat Aug 20, 2024
8d40f2d
format
robertgshaw2-redhat Aug 20, 2024
3397460
format
robertgshaw2-redhat Aug 20, 2024
ef132dc
nit
robertgshaw2-redhat Aug 20, 2024
10ef204
change
robertgshaw2-redhat Aug 20, 2024
b67718f
clean
robertgshaw2-redhat Aug 20, 2024
c3c1dbe
Update vllm/entrypoints/openai/rpc/server.py
robertgshaw2-redhat Aug 20, 2024
ee6efcf
format
robertgshaw2-redhat Aug 20, 2024
3fdc2fe
cleanup abort logic
robertgshaw2-redhat Aug 20, 2024
4cacb56
nit
robertgshaw2-redhat Aug 20, 2024
724eb31
added load test
robertgshaw2-redhat Aug 21, 2024
4d5e6b7
update load test
robertgshaw2-redhat Aug 21, 2024
b9e4168
updated
robertgshaw2-redhat Aug 21, 2024
8f9bc23
format
robertgshaw2-redhat Aug 21, 2024
9a2be3f
updated
robertgshaw2-redhat Aug 21, 2024
dee38f0
revert suurious change
robertgshaw2-redhat Aug 21, 2024
e78f443
convert to even smaller model
robertgshaw2-redhat Aug 21, 2024
cc2d7db
20k requests
robertgshaw2-redhat Aug 21, 2024
b40e269
convert to 10k requests
robertgshaw2-redhat Aug 21, 2024
03eed9c
clean up closing logic
robertgshaw2-redhat Aug 21, 2024
f697226
use constant
robertgshaw2-redhat Aug 21, 2024
fd642ab
fix bad cleanup
robertgshaw2-redhat Aug 21, 2024
762c2ed
remove useless argument
robertgshaw2-redhat Aug 21, 2024
c805ed2
up to 20k requests
robertgshaw2-redhat Aug 21, 2024
2e1652e
revert to 10k requests
robertgshaw2-redhat Aug 21, 2024
3e1ede4
revert suprious argument
robertgshaw2-redhat Aug 21, 2024
b3bf7ef
revert to 20k
robertgshaw2-redhat Aug 21, 2024
708bd34
format
robertgshaw2-redhat Aug 21, 2024
10a88ec
[BugFix] Raise all exception variations in async generator
njhill Aug 20, 2024
db8aebc
Fix possible premature generator completion; add tests
njhill Aug 21, 2024
b16c64b
format
robertgshaw2-redhat Aug 21, 2024
a9ecaa9
added test accuracy
robertgshaw2-redhat Aug 21, 2024
6f8d5e8
format
robertgshaw2-redhat Aug 21, 2024
bab177f
updated test pipeline
robertgshaw2-redhat Aug 21, 2024
7b58281
fix lm eval
robertgshaw2-redhat Aug 21, 2024
adf45d1
cleanup
robertgshaw2-redhat Aug 21, 2024
9e827b0
updated
robertgshaw2-redhat Aug 21, 2024
47dca36
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 21, 2024
f84c341
added sleep time
robertgshaw2-redhat Aug 21, 2024
0ce78f8
actually sleep
robertgshaw2-redhat Aug 21, 2024
8054348
formatting
robertgshaw2-redhat Aug 21, 2024
5ddbdab
format
robertgshaw2-redhat Aug 21, 2024
1ebbe9e
mypy
robertgshaw2-redhat Aug 21, 2024
53d639b
mypy
robertgshaw2-redhat Aug 21, 2024
a36b381
format
robertgshaw2-redhat Aug 21, 2024
415ee39
remove test load
robertgshaw2-redhat Aug 21, 2024
26440e6
stash
robertgshaw2-redhat Aug 21, 2024
2442a9d
Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/…
robertgshaw2-redhat Aug 21, 2024
b72f84f
Merge branch 'fix-raise-cancelled' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 21, 2024
4 changes: 2 additions & 2 deletions tests/tracing/test_tracing.py
@@ -114,5 +114,5 @@ def test_traces(trace_service):
SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER
) == metrics.scheduler_time
assert attributes.get(
robertgshaw2-redhat (Collaborator, Author) commented on Aug 11, 2024:
"make ./format happy"

SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/rpc/__init__.py
@@ -9,6 +9,8 @@

VLLM_RPC_SUCCESS_STR = "SUCCESS"
VLLM_RPC_HEALTHY_STR = "HEALTHY"
# TODO: figure out if this can be set to inf.
VLLM_RPC_ZMQ_MAX_SOCKETS = 1000000


@dataclass
49 changes: 40 additions & 9 deletions vllm/entrypoints/openai/rpc/client.py
@@ -1,5 +1,7 @@
import asyncio
from contextlib import contextmanager
from typing import Any, AsyncGenerator, Mapping, Optional
from uuid import uuid4

import cloudpickle
import zmq
@@ -9,8 +11,10 @@
ParallelConfig, SchedulerConfig)
from vllm.entrypoints.openai.rpc import (RPC_REQUEST_TYPE,
VLLM_RPC_HEALTHY_STR,
VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
RPCGenerateRequest, RPCUtilityRequest)
VLLM_RPC_SUCCESS_STR,
VLLM_RPC_ZMQ_MAX_SOCKETS,
RPCAbortRequest, RPCGenerateRequest,
RPCUtilityRequest)
from vllm.inputs import PromptInputs
from vllm.lora.request import LoRARequest
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
@@ -21,12 +25,40 @@
# Time to wait before checking it the server process is alive.
SERVER_START_TIMEOUT_MS = 1000

# Inprocess path
INPROC_PATH = f"inproc://{uuid4()}"


class AsyncEngineRPCClient:

def __init__(self, rpc_path: str):
self.context = zmq.asyncio.Context()
self.rpc_path = rpc_path
self.context.set(zmq.constants.MAX_SOCKETS, VLLM_RPC_ZMQ_MAX_SOCKETS)

# PROXY
self.from_client = self.context.socket(zmq.constants.ROUTER)
self.from_client.bind(INPROC_PATH)

# Connection to RPC Server.
self.to_server = self.context.socket(zmq.constants.DEALER)
self.to_server.connect(rpc_path)

self.proxy_task = asyncio.create_task(
self.run_proxy(self.from_client, self.to_server))

async def run_proxy(self, socket_from, socket_to):
poller = zmq.asyncio.Poller()
poller.register(socket_from, zmq.constants.POLLIN)
poller.register(socket_to, zmq.constants.POLLIN)
while True:
events = await poller.poll()
events = dict(events)
if socket_from in events:
msg = await socket_from.recv_multipart()
await socket_to.send_multipart(msg)
elif socket_to in events:
msg = await socket_to.recv_multipart()
await socket_from.send_multipart(msg)

async def setup(self):
"""Setup the client before it starts sending server requests."""
@@ -62,7 +94,7 @@ def socket(self):
# to enable streaming.
socket = self.context.socket(zmq.constants.DEALER)
try:
socket.connect(self.rpc_path)
socket.connect(INPROC_PATH)
yield socket
finally:
# linger == 0 means discard unsent messages
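Each request in the client checks out a fresh DEALER socket through the `socket()` context manager above, and the `finally` clause closes it with `linger=0` so unsent messages are discarded rather than blocking shutdown. The shape of that pattern can be sketched with `contextlib` alone (the `FakeSocket` class below is a hypothetical stand-in for a zmq socket, not pyzmq code):

```python
from contextlib import contextmanager

class FakeSocket:
    """Hypothetical stand-in for a zmq DEALER socket."""
    def __init__(self):
        self.connected_to = None
        self.closed = False
        self.linger = None

    def connect(self, path: str):
        self.connected_to = path

    def close(self, linger: int = -1):
        # linger=0 means: drop unsent messages instead of blocking on close.
        self.closed = True
        self.linger = linger

@contextmanager
def socket(path: str):
    sock = FakeSocket()
    try:
        sock.connect(path)
        yield sock
    finally:
        sock.close(linger=0)   # always runs, even if the body raises

with socket("inproc://example") as s:
    in_scope = s

print(in_scope.closed, in_scope.linger)  # True 0
```

The context manager guarantees cleanup per request, which matters once thousands of short-lived sockets are being created under load.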
@@ -82,9 +114,8 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest,
"""Send an RPC request that is expecting data back."""

with self.socket() as socket:

# Ping RPCServer with a request.
await socket.send(cloudpickle.dumps(request))
await socket.send_multipart([cloudpickle.dumps(request)])

# Await the data from the Server.
data = cloudpickle.loads(await socket.recv())
@@ -105,7 +136,7 @@ async def _send_one_way_rpc_request(self,
"""Send one-way RPC request to trigger an action."""
with self.socket() as socket:
# Ping RPC Server with request.
await socket.send(cloudpickle.dumps(request))
await socket.send_multipart([cloudpickle.dumps(request)])

# Await acknowledgement from RPCServer.
if timeout is not None and await socket.poll(timeout=timeout) == 0:
@@ -269,8 +300,8 @@ async def check_health(self) -> None:
with self.socket() as socket:

# Ping RPCServer with CHECK_HEALTH request.
await socket.send(cloudpickle.dumps(RPCUtilityRequest.CHECK_HEALTH)
)
await socket.send_multipart(
[cloudpickle.dumps(RPCUtilityRequest.CHECK_HEALTH)])

# Await the reply from the server.
# TODO: do we need an internal timeout here?
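The client change above multiplexes many short-lived in-process DEALER sockets onto a single connection to the server via a ROUTER-to-DEALER proxy loop. The bidirectional forwarding at the heart of `run_proxy` can be illustrated with plain asyncio queues (a hedged sketch of the pattern, not vLLM's code — the queue names here are hypothetical stand-ins for the zmq sockets and poller):

```python
import asyncio

async def run_proxy(from_client: asyncio.Queue, to_server: asyncio.Queue,
                    from_server: asyncio.Queue, to_client: asyncio.Queue):
    """Forward messages in both directions, like the zmq poller loop."""
    async def pump(src: asyncio.Queue, dst: asyncio.Queue):
        while True:
            msg = await src.get()
            await dst.put(msg)
    # Run both directions concurrently; the real code uses zmq.asyncio.Poller.
    await asyncio.gather(pump(from_client, to_server),
                         pump(from_server, to_client))

async def demo():
    fc, ts, fs, tc = (asyncio.Queue() for _ in range(4))
    proxy = asyncio.create_task(run_proxy(fc, ts, fs, tc))
    await fc.put(b"request")              # client -> proxy
    assert await ts.get() == b"request"   # proxy -> server
    await fs.put(b"reply")                # server -> proxy
    result = await tc.get()               # proxy -> client
    proxy.cancel()
    return result

print(asyncio.run(demo()))  # b'reply'
```

The point of the indirection is resource use: only one socket ever crosses the process boundary, while per-request sockets live on a cheap inproc transport.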
126 changes: 62 additions & 64 deletions vllm/entrypoints/openai/rpc/server.py
@@ -9,8 +9,10 @@

from vllm import AsyncEngineArgs, AsyncLLMEngine
from vllm.entrypoints.openai.rpc import (VLLM_RPC_HEALTHY_STR,
VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
RPCGenerateRequest, RPCUtilityRequest)
VLLM_RPC_SUCCESS_STR,
VLLM_RPC_ZMQ_MAX_SOCKETS,
RPCAbortRequest, RPCGenerateRequest,
RPCUtilityRequest)
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext

@@ -27,6 +29,7 @@ def __init__(self, async_engine_args: AsyncEngineArgs,

# Initialize context.
self.context = zmq.asyncio.Context()
self.context.set(zmq.constants.MAX_SOCKETS, VLLM_RPC_ZMQ_MAX_SOCKETS)

# Init socket for readiness state.
self.socket = self.context.socket(zmq.constants.ROUTER)
@@ -37,64 +40,55 @@ def cleanup(self):
self.socket.close()
self.context.destroy()

async def get_model_config(self, identity):
"""Send the ModelConfig"""
model_config = await self.engine.get_model_config()

await self.socket.send_multipart(
[identity, cloudpickle.dumps(model_config)])

async def get_decoding_config(self, identity):
"""Send the DecodingConfig"""
decoding_config = await self.engine.get_decoding_config()

await self.socket.send_multipart(
[identity, cloudpickle.dumps(decoding_config)])

async def get_lora_config(self, identity):
lora_config = await self.engine.get_lora_config()

await self.socket.send_multipart(
[identity, cloudpickle.dumps(lora_config)])

async def get_scheduler_config(self, identity):
"""Send the SchedulerConfig"""
parallel_config = await self.engine.get_scheduler_config()

await self.socket.send_multipart(
[identity, cloudpickle.dumps(parallel_config)])
async def get_config(self, identity, part2, request):
try:
if request == RPCUtilityRequest.GET_MODEL_CONFIG:
config = await self.engine.get_model_config()
elif request == RPCUtilityRequest.GET_DECODING_CONFIG:
config = await self.engine.get_decoding_config()
elif request == RPCUtilityRequest.GET_LORA_CONFIG:
config = await self.engine.get_lora_config()
elif request == RPCUtilityRequest.GET_SCHEDULER_CONFIG:
config = await self.engine.get_scheduler_config()
elif request == RPCUtilityRequest.GET_PARALLEL_CONFIG:
config = await self.engine.get_parallel_config()
else:
raise ValueError("Unknown Config Request: %s", request)

async def get_parallel_config(self, identity):
"""Send the ParallelConfig"""
parallel_config = await self.engine.get_parallel_config()
await self.socket.send_multipart(
[identity, part2, cloudpickle.dumps(config)])

await self.socket.send_multipart(
[identity, cloudpickle.dumps(parallel_config)])
except Exception as e:
### Notify client of all failures
await self.socket.send_multipart(
[identity, part2, cloudpickle.dumps(e)])

async def is_tracing_enabled(self, identity):
async def is_tracing_enabled(self, identity, part2):
"""Send the is_tracing_enabled flag"""
tracing_flag = await self.engine.is_tracing_enabled()

await self.socket.send_multipart(
[identity, cloudpickle.dumps(tracing_flag)])
[identity, part2, cloudpickle.dumps(tracing_flag)])

async def do_log_stats(self, identity):
async def do_log_stats(self, identity, part2):
"""Log stats and confirm success."""
await self.engine.do_log_stats()

await self.socket.send_multipart([
identity,
part2,
cloudpickle.dumps(VLLM_RPC_SUCCESS_STR),
])

async def is_server_ready(self, identity):
async def is_server_ready(self, identity, part2):
"""Notify the client that we are ready."""
await self.socket.send_multipart([
identity,
part2,
cloudpickle.dumps(VLLM_RPC_SUCCESS_STR),
])

async def abort(self, identity, request: RPCAbortRequest):
async def abort(self, identity, part2, request: RPCAbortRequest):
"""Abort request and notify the client of success."""
try:
# Abort the request in the llm engine.
@@ -105,10 +99,12 @@ async def abort(self, identity, request: RPCAbortRequest):
# Send confirmation to the client.
await self.socket.send_multipart([
identity,
part2,
cloudpickle.dumps(VLLM_RPC_SUCCESS_STR),
])

async def generate(self, identity, generate_request: RPCGenerateRequest):
async def generate(self, identity, part2,
generate_request: RPCGenerateRequest):
try:
results_generator = self.engine.generate(
generate_request.inputs,
@@ -120,51 +116,53 @@ async def generate(self, identity, part2,

async for request_output in results_generator:
await self.socket.send_multipart(
[identity, cloudpickle.dumps(request_output)])
[identity, part2,
cloudpickle.dumps(request_output)])

except Exception as e:
### Notify client of all failures
await self.socket.send_multipart([identity, cloudpickle.dumps(e)])
await self.socket.send_multipart(
[identity, part2, cloudpickle.dumps(e)])

async def check_health(self, identity):
async def check_health(self, identity, part2):
try:
await self.engine.check_health()
await self.socket.send_multipart(
[identity, cloudpickle.dumps(VLLM_RPC_HEALTHY_STR)])
[identity, part2,
cloudpickle.dumps(VLLM_RPC_HEALTHY_STR)])

except Exception as e:
await self.socket.send_multipart([identity, cloudpickle.dumps(e)])
await self.socket.send_multipart(
[identity, part2, cloudpickle.dumps(e)])

def _make_handler_coro(self, identity,
def _make_handler_coro(self, identity, part2,
message) -> Coroutine[Any, Any, Never]:
"""Route the zmq message to the handler coroutine."""

request = cloudpickle.loads(message)

if isinstance(request, RPCGenerateRequest):
return self.generate(identity, request)
return self.generate(identity, part2, request)

elif isinstance(request, RPCAbortRequest):
return self.abort(identity, request)
return self.abort(identity, part2, request)

elif isinstance(request, RPCUtilityRequest):
if request == RPCUtilityRequest.GET_MODEL_CONFIG:
return self.get_model_config(identity)
elif request == RPCUtilityRequest.GET_PARALLEL_CONFIG:
return self.get_parallel_config(identity)
elif request == RPCUtilityRequest.GET_DECODING_CONFIG:
return self.get_decoding_config(identity)
elif request == RPCUtilityRequest.GET_SCHEDULER_CONFIG:
return self.get_scheduler_config(identity)
elif request == RPCUtilityRequest.GET_LORA_CONFIG:
return self.get_lora_config(identity)
if request in [
RPCUtilityRequest.GET_MODEL_CONFIG,
RPCUtilityRequest.GET_PARALLEL_CONFIG,
RPCUtilityRequest.GET_DECODING_CONFIG,
RPCUtilityRequest.GET_SCHEDULER_CONFIG,
RPCUtilityRequest.GET_LORA_CONFIG
]:
return self.get_config(identity, part2, request)
elif request == RPCUtilityRequest.DO_LOG_STATS:
return self.do_log_stats(identity)
return self.do_log_stats(identity, part2)
elif request == RPCUtilityRequest.IS_SERVER_READY:
return self.is_server_ready(identity)
return self.is_server_ready(identity, part2)
elif request == RPCUtilityRequest.CHECK_HEALTH:
return self.check_health(identity)
return self.check_health(identity, part2)
elif request == RPCUtilityRequest.IS_TRACING_ENABLED:
return self.is_tracing_enabled(identity)
return self.is_tracing_enabled(identity, part2)
else:
raise ValueError(f"Unknown RPCUtilityRequest type: {request}")

@@ -177,11 +175,11 @@ async def run_server_loop(self):
running_tasks = set()
while True:
# Wait for a request.
identity, message = await self.socket.recv_multipart()
identity, part2, message = await self.socket.recv_multipart()
Collaborator comment: For future readers it'd be nice to add a link to some zmq docs here or give this a descriptive name to say what part2 is. From context here I'm guessing this is routing information for the client-side proxy?

robertgshaw2-redhat (Collaborator, Author): Its related to the use of ROUTER, will do

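As the review thread suggests, `part2` is an extra routing frame: with two ROUTER hops in the path (the client-side proxy's ROUTER plus the server's ROUTER), each hop prepends an identity frame on receive, so the payload arrives as `[identity, part2, message]`. A rough simulation of that enveloping with plain lists (an illustration of zmq ROUTER behavior under assumed identities, not pyzmq code):

```python
def router_recv(frames: list, hop_identity: bytes) -> list:
    """Simulate a ROUTER socket prepending the sending peer's identity frame."""
    return [hop_identity] + frames

payload = [b"<pickled RPCGenerateRequest>"]
# The client-side proxy's ROUTER prepends the inproc DEALER's identity...
via_proxy = router_recv(payload, b"inproc-dealer-id")
# ...and the server's ROUTER prepends the proxy DEALER's identity.
at_server = router_recv(via_proxy, b"client-dealer-id")

identity, part2, message = at_server
print(identity, part2)  # b'client-dealer-id' b'inproc-dealer-id'
```

Echoing both frames back on every reply is what lets the response retrace the route through the proxy to the originating per-request socket.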
# Process the request async.
task = asyncio.create_task(
self._make_handler_coro(identity, message))
self._make_handler_coro(identity, part2, message))

# We need to keep around a strong reference to the task,
# to avoid the task disappearing mid-execution as running tasks
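The comment above points at a real asyncio pitfall: the event loop keeps only a weak reference to tasks, so a fire-and-forget `create_task` result can be garbage-collected mid-run. The usual fix, which the server loop applies, is to hold each task in a set and discard it on completion — a minimal sketch with hypothetical handler names:

```python
import asyncio

running_tasks = set()

async def handle(msg: str) -> str:
    await asyncio.sleep(0)          # stand-in for real request handling
    return f"handled {msg}"

def submit(msg: str) -> asyncio.Task:
    task = asyncio.create_task(handle(msg))
    # Strong reference: without this the task may vanish mid-execution.
    running_tasks.add(task)
    # Drop the reference once the task finishes, so the set doesn't grow.
    task.add_done_callback(running_tasks.discard)
    return task

async def main():
    return await asyncio.gather(*(submit(m) for m in ("a", "b")))

print(asyncio.run(main()))  # ['handled a', 'handled b']
```

Under a 20k-request load test, skipping the done-callback cleanup would leak one finished task per request.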