[ci] try to add multi-node tests #6280

Merged (29 commits, Jul 13, 2024)
Changes from 25 commits
52 changes: 40 additions & 12 deletions .buildkite/run-multi-node-test.sh
@@ -2,16 +2,17 @@

set -euox pipefail

-if [[ $# -lt 3 ]]; then
-echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4

-shift 3
+shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
fi
done
GPU_DEVICES+='"'
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null

+# start the container in detached mode
+# things to note:
+# 1. --shm-size=10.24gb is required. don't use --ipc=host
+# 2. pass HF_TOKEN to the container
+# 3. map the huggingface cache directory to the container
+# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+# starting from 192.168.10.11)
+docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"

+# organize containers into a ray cluster
+if [ $node -eq 0 ]; then
+# start the ray head node
+docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+# wait for the head node to be ready
+sleep 10
+else
+# start the ray worker nodes, and connect them to the head node
+docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+fi
done

+# wait for the cluster to be ready
+sleep 10

+# print the cluster status
+docker exec node0 /bin/bash -c "ray status"
}
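Note: the containers are attached to a user-defined bridge network named docker-net with static addresses in 192.168.10.0/24. Creating that network is not part of this hunk; a minimal sketch of the assumed one-time setup (the subnet is inferred from the addresses above, not taken from this PR):

    # assumed setup for the bridge network referenced by --network docker-net;
    # the subnet is inferred from the 192.168.10.x addresses used in the script
    docker network create --subnet=192.168.10.0/24 docker-net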

run_nodes() {
-for node in $(seq 0 $(($NUM_NODES-1))); do
+# important: iterate in reverse order to start the head node last
+# we start the worker nodes first, in detached mode, and then start the head node
+# in the foreground, so that the output of the head node is visible in the buildkite logs
+for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
-if [ $node -lt $(($NUM_NODES - 1)) ]; then
-docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+if [ $node -ne 0 ]; then
+docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
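For reference, a hypothetical invocation of the updated script, with the new WORKING_DIR argument first and one command string per node (the image and commands are placeholders, not taken from this PR):

    # hypothetical usage: 2 nodes, 2 GPUs per node, one command string per node
    .buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 <docker-image> \
        "<command for the head node>" \
        "<command for the worker node>"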
16 changes: 15 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -39,6 +39,17 @@ steps:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py

+- label: 2 Node Tests (4 GPUs in total)
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+num_nodes: 2
+commands:
+- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
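The comment entries above mark where each node's command group begins; presumably the pipeline tooling translates this step into a run-multi-node-test.sh invocation roughly like the sketch below (an illustrative assumption: the generation logic is not part of this diff, and joining one node's commands with && is a guess):

    # illustrative assumption of how the step above could map onto the new script
    .buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 <docker-image> \
        "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py && TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py" \
        "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py"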

- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
@@ -182,7 +193,10 @@ steps:

- label: Tensorizer Test
#mirror_hardwares: [amd]
-command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+commands:
+- apt-get install curl libsodium23
+- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+- pytest -v -s tensorizer_loader

- label: Metrics Test
mirror_hardwares: [amd]
14 changes: 2 additions & 12 deletions tests/async_engine/test_openapi_server_ray.py
@@ -1,24 +1,14 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray

-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
16 changes: 3 additions & 13 deletions tests/distributed/test_pipeline_parallel.py
@@ -2,11 +2,8 @@

import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray

-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# downloading lora to test lora requests

@@ -21,14 +18,7 @@


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
args = [
"--model",
MODEL_NAME,
@@ -50,7 +40,7 @@ def server(ray_ctx):
args += [
"--enforce-eager",
]
-return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+return RemoteOpenAIServer(args)


@pytest.fixture(scope="module")
1 change: 1 addition & 0 deletions tests/distributed/test_same_node.py
@@ -10,3 +10,4 @@

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_chat.py
@@ -6,15 +6,12 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import torch
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -76,14 +73,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_completion.py
@@ -6,17 +6,14 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -78,14 +75,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
12 changes: 2 additions & 10 deletions tests/entrypoints/openai/test_embedding.py
@@ -3,22 +3,14 @@
import numpy as np
import openai
import pytest
-import ray

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def embedding_server(ray_ctx):
+def embedding_server():
return RemoteOpenAIServer([
"--model",
EMBEDDING_MODEL_NAME,
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_models.py
@@ -1,12 +1,9 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
# downloading lora to test lora requests
from huggingface_hub import snapshot_download

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,14 +18,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
10 changes: 1 addition & 9 deletions tests/entrypoints/openai/test_vision.py
@@ -3,7 +3,6 @@
import openai
import pytest
import pytest_asyncio
-import ray

from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64

@@ -23,14 +22,7 @@


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
7 changes: 1 addition & 6 deletions tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@

import openai
import pytest
-import ray
import torch
from tensorizer import EncryptionParams

@@ -22,7 +21,7 @@
tensorize_vllm_model)

from ..conftest import VllmRunner, cleanup
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# yapf conflicts with isort for this docstring

@@ -220,8 +219,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
json.dumps(model_loader_extra_config),
]

-ray.init(runtime_env={"working_dir": VLLM_PATH})
-
server = RemoteOpenAIServer(openai_args)
print("Server ready.")

@@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
base_model.model.llm_engine.model_executor.shutdown()
del base_model
cleanup()
-ray.shutdown()

# load model with two shards and serialize with encryption
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
cleanup()
-ray.shutdown()

loaded_vllm_model = vllm_runner(
model_ref,