[ci] try to add multi-node tests #6280

Merged (29 commits, Jul 13, 2024)
Changes from 25 commits
52 changes: 40 additions & 12 deletions .buildkite/run-multi-node-test.sh
@@ -2,16 +2,17 @@

set -euox pipefail

-if [[ $# -lt 3 ]]; then
-echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi

-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4

-shift 3
+shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
fi
done
GPU_DEVICES+='"'
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null

+# start the container in detached mode
+# things to note:
+# 1. --shm-size=10.24gb is required. don't use --ipc=host
+# 2. pass HF_TOKEN to the container
+# 3. map the huggingface cache directory to the container
+# 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+# starting from 192.168.10.11)
+docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"

+# organize containers into a ray cluster
+if [ $node -eq 0 ]; then
+# start the ray head node
+docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+# wait for the head node to be ready
+sleep 10
+else
+# start the ray worker nodes, and connect them to the head node
+docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+fi
done

+# wait for the cluster to be ready
+sleep 10

+# print the cluster status
+docker exec node0 /bin/bash -c "ray status"
}
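Note: the containers are attached to a user-defined bridge network named docker-net with static addresses in 192.168.10.0/24. Creating that network is not part of this hunk; a minimal sketch of the assumed one-time setup (the subnet is inferred from the addresses above, not taken from this PR):

    # assumed setup for the bridge network referenced by --network docker-net;
    # the subnet is inferred from the 192.168.10.x addresses used in the script
    docker network create --subnet=192.168.10.0/24 docker-net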

run_nodes() {
-for node in $(seq 0 $(($NUM_NODES-1))); do
+# important: iterate in reverse order to start the head node last
+# we start the worker nodes first, in detached mode, and then start the head node
+# in the foreground, so that the output of the head node is visible in the buildkite logs
+for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
-if [ $node -lt $(($NUM_NODES - 1)) ]; then
-docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+if [ $node -ne 0 ]; then
+docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
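For reference, a hypothetical invocation of the updated script, with the new WORKING_DIR argument first and one command string per node (the image and commands are placeholders, not taken from this PR):

    # hypothetical usage: 2 nodes, 2 GPUs per node, one command string per node
    .buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 <docker-image> \
        "<command for the head node>" \
        "<command for the worker node>"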
16 changes: 15 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -39,6 +39,17 @@ steps:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py

+- label: 2 Node Tests (4 GPUs in total)
+working_dir: "/vllm-workspace/tests"
+num_gpus: 2
+num_nodes: 2
+commands:
+- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
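The comment entries above mark where each node's command group begins; presumably the pipeline tooling translates this step into a run-multi-node-test.sh invocation roughly like the sketch below (an illustrative assumption: the generation logic is not part of this diff, and joining one node's commands with && is a guess):

    # illustrative assumption of how the step above could map onto the new script
    .buildkite/run-multi-node-test.sh /vllm-workspace/tests 2 2 <docker-image> \
        "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py && TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py" \
        "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py"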

- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
@@ -182,7 +193,10 @@ steps:

- label: Tensorizer Test
#mirror_hardwares: [amd]
-command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+commands:
+- apt-get install curl libsodium23
+- export VLLM_WORKER_MULTIPROC_METHOD=spawn
+- pytest -v -s tensorizer_loader

- label: Metrics Test
mirror_hardwares: [amd]
14 changes: 2 additions & 12 deletions tests/async_engine/test_openapi_server_ray.py
@@ -1,24 +1,14 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray

-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
16 changes: 3 additions & 13 deletions tests/distributed/test_pipeline_parallel.py
@@ -2,11 +2,8 @@

import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray

-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# downloading lora to test lora requests

@@ -21,14 +18,7 @@


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
args = [
"--model",
MODEL_NAME,
@@ -50,7 +40,7 @@ def server(ray_ctx):
args += [
"--enforce-eager",
]
-return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+return RemoteOpenAIServer(args)


@pytest.fixture(scope="module")
1 change: 1 addition & 0 deletions tests/distributed/test_same_node.py
@@ -10,3 +10,4 @@

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_chat.py
@@ -6,15 +6,12 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import torch
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -76,14 +73,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_completion.py
@@ -6,17 +6,14 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError

from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -78,14 +75,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
12 changes: 2 additions & 10 deletions tests/entrypoints/openai/test_embedding.py
@@ -3,22 +3,14 @@
import numpy as np
import openai
import pytest
-import ray

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def embedding_server(ray_ctx):
+def embedding_server():
return RemoteOpenAIServer([
"--model",
EMBEDDING_MODEL_NAME,
14 changes: 2 additions & 12 deletions tests/entrypoints/openai/test_models.py
@@ -1,12 +1,9 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
# downloading lora to test lora requests
from huggingface_hub import snapshot_download

-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,14 +18,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
+def server(zephyr_lora_files):
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
10 changes: 1 addition & 9 deletions tests/entrypoints/openai/test_vision.py
@@ -3,7 +3,6 @@
import openai
import pytest
import pytest_asyncio
-import ray

from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64

@@ -23,14 +22,7 @@


@pytest.fixture(scope="module")
-def ray_ctx():
-ray.init(runtime_env={"working_dir": VLLM_PATH})
-yield
-ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
return RemoteOpenAIServer([
"--model",
MODEL_NAME,
7 changes: 1 addition & 6 deletions tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@

import openai
import pytest
-import ray
import torch
from tensorizer import EncryptionParams

@@ -22,7 +21,7 @@
tensorize_vllm_model)

from ..conftest import VllmRunner, cleanup
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer

# yapf conflicts with isort for this docstring

@@ -220,8 +219,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
json.dumps(model_loader_extra_config),
]

-ray.init(runtime_env={"working_dir": VLLM_PATH})
-
server = RemoteOpenAIServer(openai_args)
print("Server ready.")

@@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
base_model.model.llm_engine.model_executor.shutdown()
del base_model
cleanup()
-ray.shutdown()

# load model with two shards and serialize with encryption
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
cleanup()
-ray.shutdown()

loaded_vllm_model = vllm_runner(
model_ref,