diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 0d94b2555f166..7ac4dcc4c786d 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -2,16 +2,17 @@ set -euox pipefail -if [[ $# -lt 3 ]]; then - echo "Please provide the number of nodes and GPU per node." +if [[ $# -lt 4 ]]; then + echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" exit 1 fi -NUM_NODES=$1 -NUM_GPUS=$2 -DOCKER_IMAGE=$3 +WORKING_DIR=$1 +NUM_NODES=$2 +NUM_GPUS=$3 +DOCKER_IMAGE=$4 -shift 3 +shift 4 COMMANDS=("$@") if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then echo "The number of commands must be equal to the number of nodes." @@ -40,13 +41,40 @@ start_nodes() { fi done GPU_DEVICES+='"' - # echo "Starting node$node with GPU devices: $GPU_DEVICES" - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null + + # start the container in detached mode + # things to note: + # 1. --shm-size=10.24gb is required. don't use --ipc=host + # 2. pass HF_TOKEN to the container + # 3. map the huggingface cache directory to the container + # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: + # starting from 192.168.10.11) + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + + # organize containers into a ray cluster + if [ $node -eq 0 ]; then + # start the ray head node + docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + # wait for the head node to be ready + sleep 10 + else + # start the ray worker nodes, and connect them to the head node + docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + fi done + + # wait for the cluster to be ready + sleep 10 + + # print the cluster status + docker exec node0 /bin/bash -c "ray status" } run_nodes() { - for node in $(seq 0 $(($NUM_NODES-1))); do + # important: iterate in reverse order to start the head node last + # we start the worker nodes first, in detached mode, and then start the head node + # in the foreground, so that the output of the head node is visible in the buildkite logs + for node in $(seq $(($NUM_NODES - 1)) -1 0); do GPU_DEVICES='"device=' for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) @@ -57,10 +85,10 @@ run_nodes() { done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -lt $(($NUM_NODES - 1)) ]; then - docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}" + if [ $node -ne 0 ]; then + docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "${COMMANDS[$node]}" + docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9f388b6f8443d..c8f53224b1dcf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -68,6 +68,17 @@ steps: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py +- label: 2 Node Tests (4 GPUs in total) + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + commands: + - # the following commands are for the first node, with ip 
192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" @@ -213,7 +224,10 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + commands: + - apt-get install curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader - label: Metrics Test mirror_hardwares: [amd] diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index cc05d79e56874..575f8f19b8ebe 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -1,35 +1,26 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. -import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(ray_ctx): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--max-model-len", - "2048", - "--enforce-eager", - "--engine-use-ray" - ]) +def server(): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--enforce-eager", + "--engine-use-ray" + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 6072a2dd71800..2d9f63795189d 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -2,11 +2,8 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. 
-import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # downloading lora to test lora requests @@ -21,14 +18,7 @@ @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(ray_ctx): +def server(): args = [ "--model", MODEL_NAME, @@ -50,7 +40,8 @@ def server(ray_ctx): args += [ "--enforce-eager", ] - return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE) + with RemoteOpenAIServer(args) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 2d886eb566d5d..07e84d0ad54cd 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -10,3 +10,4 @@ expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" assert test_result == expected, f"Expected {expected}, got {test_result}" +print("Same node test passed!") diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d6df826949949..d370c63c0c7ba 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -6,15 +6,12 @@ import jsonschema import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. -import ray import torch # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -29,35 +26,29 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) +def server(zephyr_lora_files): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index d222981d2bba7..6e5fdebe786e1 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -6,9 +6,6 @@ import jsonschema import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. 
-import ray import requests # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -16,7 +13,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -31,35 +28,29 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) +def server(zephyr_lora_files): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f8aa1c9143a3b..4a32aadc8c3ae 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -3,33 +3,26 @@ import numpy as np import openai import pytest -import ray -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def embedding_server(ray_ctx): - return RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "8192", - "--enforce-eager", - ]) +def embedding_server(): + with RemoteOpenAIServer([ + "--model", + EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ]) as remote_server: + yield remote_server @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 914ef6e19e109..bf63f9a813f2c 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,12 +1,9 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. 
-import ray # downloading lora to test lora requests from huggingface_hub import snapshot_download -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -21,35 +18,29 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) +def server(zephyr_lora_files): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index b869717608d0f..563b68566bd2c 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -3,7 +3,6 @@ import openai import pytest import pytest_asyncio -import ray from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 @@ -23,25 +22,19 @@ @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(ray_ctx): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), - ]) +def server(): + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b2ebcc15cd0fc..a43f9132585b5 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -6,7 +6,6 @@ import openai import pytest -import ray import torch from tensorizer import EncryptionParams @@ -22,7 +21,7 @@ tensorize_vllm_model) from ..conftest import VllmRunner, cleanup -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # yapf conflicts with isort for this docstring @@ -220,23 +219,21 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): json.dumps(model_loader_extra_config), ] - ray.init(runtime_env={"working_dir": VLLM_PATH}) + with RemoteOpenAIServer(openai_args) as server: + print("Server ready.") - server = RemoteOpenAIServer(openai_args) - print("Server ready.") + client = server.get_client() + completion = client.completions.create(model=model_ref, + prompt="Hello, 
my name is", + max_tokens=5, + temperature=0.0) - client = server.get_client() - completion = client.completions.create(model=model_ref, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert len(completion.choices) == 1 - assert len(completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) + assert completion.id is not None + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) >= 5 + assert completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) def test_raise_value_error_on_invalid_load_format(vllm_runner): @@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, base_model.model.llm_engine.model_executor.shutdown() del base_model cleanup() - ray.shutdown() # load model with two shards and serialize with encryption model_path = str(tmp_path / (model_ref + "-%02d.tensors")) @@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, assert os.path.isfile(model_path % 0), "Serialization subprocess failed" assert os.path.isfile(model_path % 1), "Serialization subprocess failed" cleanup() - ray.shutdown() loaded_vllm_model = vllm_runner( model_ref, diff --git a/tests/utils.py b/tests/utils.py index ad4d097b0e8ed..50f723b0b18a3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,53 +49,7 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - class _RemoteRunner: - - def __init__(self, cli_args: List[str], *, wait_url: str, - wait_timeout: float) -> None: - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - self.proc = subprocess.Popen( - [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - *cli_args - ], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - - self._wait_for_server(url=wait_url, timeout=wait_timeout) - - def ready(self): - return True - - def _wait_for_server(self, *, url: str, timeout: float): - # run health check - start = time.time() - while True: - try: - if requests.get(url).status_code == 200: - break - except Exception as err: - if self.proc.poll() is not None: - raise RuntimeError( - "Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > timeout: - raise RuntimeError( - "Server failed to start in time.") from err - - def __del__(self): - if hasattr(self, "proc"): - self.proc.terminate() - - def __init__(self, - cli_args: List[str], - *, - auto_port: bool = True, - num_gpus: int = 1) -> None: + def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" @@ -108,13 +62,41 @@ def __init__(self, self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = ray.remote(num_gpus=num_gpus)( - self._RemoteRunner).remote( - cli_args, - wait_url=self.url_for("health"), - wait_timeout=self.MAX_SERVER_START_WAIT_S) - - self._wait_until_ready() + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + self.proc = 
subprocess.Popen( + [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr) + self._wait_for_server(url=self.url_for("health"), + timeout=self.MAX_SERVER_START_WAIT_S) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() + + def _wait_for_server(self, *, url: str, timeout: float): + # run health check + start = time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as err: + result = self.proc.poll() + if result is not None and result != 0: + raise RuntimeError("Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError( + "Server failed to start in time.") from err @property def url_root(self) -> str: @@ -123,9 +105,6 @@ def url_root(self) -> str: def url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) - def _wait_until_ready(self) -> None: - ray.get(self._runner.ready.remote()) - def get_client(self): return openai.OpenAI( base_url=self.url_for("v1"), diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 6e13264aba233..388f934ef75a6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -224,16 +224,13 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # broadcasted to. self.non_driver_workers: List[RayWorkerWrapper] = [] - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - for tp_rank in range(self.parallel_config.tensor_parallel_size): - rank = (pp_rank * - self.parallel_config.tensor_parallel_size) + tp_rank - if rank == 0: - pass - elif rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(self.workers[rank - 1]) - else: - self.non_driver_workers.append(self.workers[rank - 1]) + for idx, rank in enumerate(worker_ranks[1:]): + # We need to skip the driver worker, which we + # do by skipping worker_ranks[0] which is always 0. + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(self.workers[idx]) + else: + self.non_driver_workers.append(self.workers[idx]) def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest]
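
Note on the `ray_gpu_executor.py` hunk above: the new loop walks the actual `worker_ranks` list instead of recomputing ranks from nested pipeline/tensor loops, skips `worker_ranks[0]` (always the rank-0 driver), treats the first rank of each pipeline stage as a TP driver worker, and classifies the rest as non-driver workers. The sketch below is a standalone illustration of that rule only; `partition_workers` is a hypothetical helper, and the real code appends `self.workers[idx]` wrapper objects rather than bare ranks.

```python
# Standalone sketch (not vLLM code) of the rank-partitioning rule above.
from typing import List, Tuple


def partition_workers(worker_ranks: List[int],
                      tensor_parallel_size: int) -> Tuple[List[int], List[int]]:
    tp_driver_ranks: List[int] = []
    non_driver_ranks: List[int] = []
    # worker_ranks[0] is always 0: that rank is the driver process itself.
    for rank in worker_ranks[1:]:
        if rank % tensor_parallel_size == 0:
            # First rank of each pipeline stage drives that stage's TP group.
            tp_driver_ranks.append(rank)
        else:
            non_driver_ranks.append(rank)
    return tp_driver_ranks, non_driver_ranks


if __name__ == "__main__":
    # PP=2, TP=2, matching the new 2-node CI test: global ranks 0..3.
    tp_drivers, non_drivers = partition_workers([0, 1, 2, 3],
                                                tensor_parallel_size=2)
    assert tp_drivers == [2]      # rank 2 heads the second pipeline stage
    assert non_drivers == [1, 3]  # remaining tensor-parallel workers
    print("tp drivers:", tp_drivers, "non-driver workers:", non_drivers)
```

Iterating `enumerate(worker_ranks[1:])` and indexing `self.workers[idx]` appears to be the point of the change: it keys the classification off the ranks the workers actually hold rather than assuming they are laid out sequentially, which the replaced nested loop did via `self.workers[rank - 1]`.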