From 31e715abff5cc51cda8c0e3a1ce963daf0b4c055 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 19:58:01 -0700 Subject: [PATCH 01/27] try to add multi-node tests --- .buildkite/test-pipeline.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8013fbb642bb8..8574812cb6971 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -39,6 +39,15 @@ steps: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py +- label: 2 Node Tests + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + commands: + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + + - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" From e0b9433dbebb36882753e4d7418b4b1770277fbf Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 20:04:51 -0700 Subject: [PATCH 02/27] only one test --- .buildkite/test-pipeline.yaml | 245 ---------------------------------- 1 file changed, 245 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8574812cb6971..79fb74f52c7bf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,44 +1,3 @@ -# In this file, you can add more tests to run either by adding a new step or -# adding a new command to an existing step. See different options here for examples. - -# This script will be feed into Jinja template in `test-template-aws.j2` at -# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 -# to generate the final pipeline yaml file. 
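The 2 Node Tests step introduced above runs the same command on both nodes through torchrun's c10d rendezvous (endpoint 192.168.10.10) and sets VLLM_TEST_SAME_HOST=0 so the test expects ranks to land on different hosts. A minimal sketch of such a same-node check, built only on torch.distributed primitives rather than vLLM's own helper (the hostname comparison here is an assumption for illustration, not the project's implementation):

```python
# Hypothetical same-node check in the spirit of distributed/test_same_node.py.
import os
import socket

import torch.distributed as dist

if __name__ == "__main__":
    # torchrun supplies RANK/WORLD_SIZE and wires up the c10d rendezvous given
    # by --rdzv_endpoint, so init_process_group needs no explicit addresses.
    dist.init_process_group(backend="gloo")

    # Gather one hostname per rank and check whether they all match.
    hostnames = [None] * dist.get_world_size()
    dist.all_gather_object(hostnames, socket.gethostname())
    same_host = len(set(hostnames)) == 1

    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
    assert same_host == expected, f"Expected {expected}, got {same_host}"
    print("Same node test passed!")
    dist.destroy_process_group()
```

Launched on each node with `torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=<head-ip>`, the assertion passes with VLLM_TEST_SAME_HOST=0 only when the gathered hostnames differ.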
- - -steps: -- label: Regression Test - mirror_hardwares: [amd] - command: pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: AsyncEngine Test - #mirror_hardwares: [amd] - command: pytest -v -s async_engine - -- label: Basic Correctness Test - mirror_hardwares: [amd] - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - -- label: Core Test - mirror_hardwares: [amd] - commands: - - pytest -v -s core - - pytest -v -s distributed/test_parallel_state.py - -- label: Distributed Comm Ops Test - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - label: 2 Node Tests working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -46,207 +5,3 @@ steps: commands: - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - -- label: Distributed Tests (2 GPUs) - mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - commands: - - bash ../.buildkite/download-images.sh - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s 
test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - -- label: Distributed Tests (4 GPUs) - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - commands: - - pytest -v -s distributed/test_pynccl.py - # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. - # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - -- label: Pipeline Parallelism Test - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - commands: - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - -- label: Engine Test - mirror_hardwares: [amd] - command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py - -- label: Entrypoints Test - mirror_hardwares: [amd] - - commands: - - pytest -v -s entrypoints/llm - - pytest -v -s entrypoints/openai - -- label: Examples Test - working_dir: "/vllm-workspace/examples" - mirror_hardwares: [amd] - commands: - # install aws cli for llava_example.py - # install tensorizer for tensorize_vllm_model.py - - pip install awscli tensorizer - - python3 offline_inference.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 llava_example.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - -- label: Inputs Test - #mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s test_inputs.py - - pytest -v -s multimodal - -- label: Kernels Test %N - #mirror_hardwares: [amd] - commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 - -- label: Models Test - #mirror_hardwares: [amd] - commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - pytest -v -s models -m \"not vlm\" - -- label: Vision Language Models Test - mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s models -m vlm - -- label: Prefix Caching Test - mirror_hardwares: [amd] - commands: - - pytest -v -s prefix_caching - -- label: Samplers Test - #mirror_hardwares: [amd] - command: pytest -v -s samplers - -- label: LogitsProcessor Test - mirror_hardwares: [amd] - command: pytest -v -s test_logits_processor.py - -- label: Utils Test - command: pytest -v -s 
test_utils.py - -- label: Worker Test - mirror_hardwares: [amd] - command: pytest -v -s worker - -- label: Speculative decoding tests - #mirror_hardwares: [amd] - commands: - # See https://github.com/vllm-project/vllm/issues/5152 - - export VLLM_ATTENTION_BACKEND=XFORMERS - - pytest -v -s spec_decode - -- label: LoRA Test %N - #mirror_hardwares: [amd] - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py - parallelism: 4 - -- label: LoRA Long Context (Distributed) - #mirror_hardwares: [amd] - num_gpus: 4 - # This test runs llama 13B, so it is required to run on 4 GPUs. - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_long_context.py - -- label: Tensorizer Test - #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader - -- label: Metrics Test - mirror_hardwares: [amd] - command: pytest -v -s metrics - -- label: Quantization Test - #mirror_hardwares: [amd] - command: pytest -v -s quantization - -- label: Tracing Test - commands: - - "pip install \ - opentelemetry-sdk \ - opentelemetry-api \ - opentelemetry-exporter-otlp \ - opentelemetry-semantic-conventions-ai" - - pytest -v -s tracing - -- label: Benchmarks - working_dir: "/vllm-workspace/.buildkite" - mirror_hardwares: [amd] - commands: - - pip install aiohttp - - bash run-benchmarks.sh - -- label: LM Eval Small Models - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - pip install lm-eval - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-small.txt -t 1 - -- label: LM Eval Large Models - gpu: a100 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - pip install lm-eval - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-large.txt -t 4 - -- label: Documentation Build - working_dir: "/vllm-workspace/test_docs/docs" - no_gpu: True - commands: - - pip install -r requirements-docs.txt - - SPHINXOPTS=\"-W\" make html - -- label: Distributed Tests (A100) - gpu: a100 - num_gpus: 4 - commands: - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - pytest -v -s -x lora/test_mixtral.py From 30ba3f15aa085c421fb5fb49eea4ccdcfaf7bbed Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 21:03:52 -0700 Subject: [PATCH 03/27] reset --- .buildkite/test-pipeline.yaml | 245 ++++++++++++++++++++++++++++++++++ 1 file changed, 245 
insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 79fb74f52c7bf..8574812cb6971 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,3 +1,44 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + + +steps: +- label: Regression Test + mirror_hardwares: [amd] + command: pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: AsyncEngine Test + #mirror_hardwares: [amd] + command: pytest -v -s async_engine + +- label: Basic Correctness Test + mirror_hardwares: [amd] + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +- label: Core Test + mirror_hardwares: [amd] + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py + +- label: Distributed Comm Ops Test + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - label: 2 Node Tests working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -5,3 +46,207 @@ commands: - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + + +- label: Distributed Tests (2 GPUs) + mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + commands: + - bash ../.buildkite/download-images.sh + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp 
pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py + +- label: Distributed Tests (4 GPUs) + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + +- label: Pipeline Parallelism Test + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + + +- label: Engine Test + mirror_hardwares: [amd] + command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py + +- label: Entrypoints Test + mirror_hardwares: [amd] + + commands: + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai + +- label: Examples Test + working_dir: "/vllm-workspace/examples" + mirror_hardwares: [amd] + commands: + # install aws cli for llava_example.py + # install tensorizer for tensorize_vllm_model.py + - pip install awscli tensorizer + - python3 offline_inference.py + - python3 offline_inference_with_prefix.py + - python3 llm_engine_example.py + - python3 llava_example.py + - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + +- label: Inputs Test + #mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + +- label: Kernels Test %N + #mirror_hardwares: [amd] + commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 4 + +- label: Models Test + #mirror_hardwares: [amd] + commands: + - pip install 
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s models -m \"not vlm\" + +- label: Vision Language Models Test + mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s models -m vlm + +- label: Prefix Caching Test + mirror_hardwares: [amd] + commands: + - pytest -v -s prefix_caching + +- label: Samplers Test + #mirror_hardwares: [amd] + command: pytest -v -s samplers + +- label: LogitsProcessor Test + mirror_hardwares: [amd] + command: pytest -v -s test_logits_processor.py + +- label: Utils Test + command: pytest -v -s test_utils.py + +- label: Worker Test + mirror_hardwares: [amd] + command: pytest -v -s worker + +- label: Speculative decoding tests + #mirror_hardwares: [amd] + commands: + # See https://github.com/vllm-project/vllm/issues/5152 + - export VLLM_ATTENTION_BACKEND=XFORMERS + - pytest -v -s spec_decode + +- label: LoRA Test %N + #mirror_hardwares: [amd] + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + parallelism: 4 + +- label: LoRA Long Context (Distributed) + #mirror_hardwares: [amd] + num_gpus: 4 + # This test runs llama 13B, so it is required to run on 4 GPUs. + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_long_context.py + +- label: Tensorizer Test + #mirror_hardwares: [amd] + command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + +- label: Metrics Test + mirror_hardwares: [amd] + command: pytest -v -s metrics + +- label: Quantization Test + #mirror_hardwares: [amd] + command: pytest -v -s quantization + +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + +- label: Benchmarks + working_dir: "/vllm-workspace/.buildkite" + mirror_hardwares: [amd] + commands: + - pip install aiohttp + - bash run-benchmarks.sh + +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + +- label: Documentation Build + working_dir: "/vllm-workspace/test_docs/docs" + no_gpu: True + commands: + - pip install -r requirements-docs.txt + - SPHINXOPTS=\"-W\" make html + +- label: Distributed Tests (A100) + gpu: a100 + num_gpus: 4 + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install 
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s -x lora/test_mixtral.py From 771ff3ba944cbc2b9ebfda344e6b4d9ec5de8fee Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 21:04:14 -0700 Subject: [PATCH 04/27] retry --- .buildkite/test-pipeline.yaml | 235 ---------------------------------- 1 file changed, 235 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8574812cb6971..7baeef85001d4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -7,37 +7,6 @@ steps: -- label: Regression Test - mirror_hardwares: [amd] - command: pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: AsyncEngine Test - #mirror_hardwares: [amd] - command: pytest -v -s async_engine - -- label: Basic Correctness Test - mirror_hardwares: [amd] - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - -- label: Core Test - mirror_hardwares: [amd] - commands: - - pytest -v -s core - - pytest -v -s distributed/test_parallel_state.py - -- label: Distributed Comm Ops Test - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - label: 2 Node Tests working_dir: "/vllm-workspace/tests" @@ -46,207 +15,3 @@ steps: commands: - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - -- label: Distributed Tests (2 GPUs) - mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - commands: - - bash ../.buildkite/download-images.sh - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s 
distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - -- label: Distributed Tests (4 GPUs) - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - commands: - - pytest -v -s distributed/test_pynccl.py - # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. - # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - -- label: Pipeline Parallelism Test - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - commands: - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - -- label: Engine Test - mirror_hardwares: [amd] - command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py - -- label: Entrypoints Test - mirror_hardwares: [amd] - - commands: - - pytest -v -s entrypoints/llm - - pytest -v -s entrypoints/openai - -- label: Examples Test - working_dir: "/vllm-workspace/examples" - mirror_hardwares: [amd] - commands: - # install aws cli for llava_example.py - # install tensorizer for tensorize_vllm_model.py - - pip install awscli tensorizer - - python3 offline_inference.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 llava_example.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - -- label: Inputs Test - #mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s test_inputs.py - - pytest -v -s multimodal - -- label: Kernels Test %N - #mirror_hardwares: [amd] - commands: - - pip install 
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 - -- label: Models Test - #mirror_hardwares: [amd] - commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - pytest -v -s models -m \"not vlm\" - -- label: Vision Language Models Test - mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s models -m vlm - -- label: Prefix Caching Test - mirror_hardwares: [amd] - commands: - - pytest -v -s prefix_caching - -- label: Samplers Test - #mirror_hardwares: [amd] - command: pytest -v -s samplers - -- label: LogitsProcessor Test - mirror_hardwares: [amd] - command: pytest -v -s test_logits_processor.py - -- label: Utils Test - command: pytest -v -s test_utils.py - -- label: Worker Test - mirror_hardwares: [amd] - command: pytest -v -s worker - -- label: Speculative decoding tests - #mirror_hardwares: [amd] - commands: - # See https://github.com/vllm-project/vllm/issues/5152 - - export VLLM_ATTENTION_BACKEND=XFORMERS - - pytest -v -s spec_decode - -- label: LoRA Test %N - #mirror_hardwares: [amd] - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py - parallelism: 4 - -- label: LoRA Long Context (Distributed) - #mirror_hardwares: [amd] - num_gpus: 4 - # This test runs llama 13B, so it is required to run on 4 GPUs. - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s -x lora/test_long_context.py - -- label: Tensorizer Test - #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader - -- label: Metrics Test - mirror_hardwares: [amd] - command: pytest -v -s metrics - -- label: Quantization Test - #mirror_hardwares: [amd] - command: pytest -v -s quantization - -- label: Tracing Test - commands: - - "pip install \ - opentelemetry-sdk \ - opentelemetry-api \ - opentelemetry-exporter-otlp \ - opentelemetry-semantic-conventions-ai" - - pytest -v -s tracing - -- label: Benchmarks - working_dir: "/vllm-workspace/.buildkite" - mirror_hardwares: [amd] - commands: - - pip install aiohttp - - bash run-benchmarks.sh - -- label: LM Eval Small Models - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - pip install lm-eval - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-small.txt -t 1 - -- label: LM Eval Large Models - gpu: a100 - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - commands: - - pip install lm-eval - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-large.txt -t 4 - -- label: Documentation Build - working_dir: "/vllm-workspace/test_docs/docs" - no_gpu: True - commands: - - pip install -r requirements-docs.txt - - SPHINXOPTS=\"-W\" make html - -- label: Distributed Tests (A100) - gpu: a100 - num_gpus: 4 - commands: - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - TEST_DIST_MODEL=facebook/opt-125m 
DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - pytest -v -s -x lora/test_mixtral.py From b40e67451fc7a170316b37e31ff026bd07da338e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 21:08:11 -0700 Subject: [PATCH 05/27] retry --- .buildkite/test-pipeline.yaml | 235 ++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7baeef85001d4..8574812cb6971 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -7,6 +7,37 @@ steps: +- label: Regression Test + mirror_hardwares: [amd] + command: pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: AsyncEngine Test + #mirror_hardwares: [amd] + command: pytest -v -s async_engine + +- label: Basic Correctness Test + mirror_hardwares: [amd] + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +- label: Core Test + mirror_hardwares: [amd] + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py + +- label: Distributed Comm Ops Test + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py - label: 2 Node Tests working_dir: "/vllm-workspace/tests" @@ -15,3 +46,207 @@ steps: commands: - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + + +- label: Distributed Tests (2 GPUs) + mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + commands: + - bash ../.buildkite/download-images.sh + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s 
distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py + +- label: Distributed Tests (4 GPUs) + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + +- label: Pipeline Parallelism Test + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py + - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + + +- label: Engine Test + mirror_hardwares: [amd] + command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py + +- label: Entrypoints Test + mirror_hardwares: [amd] + + commands: + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai + +- label: Examples Test + working_dir: "/vllm-workspace/examples" + mirror_hardwares: [amd] + commands: + # install aws cli for llava_example.py + # install tensorizer for tensorize_vllm_model.py + - pip install awscli tensorizer + - python3 offline_inference.py + - python3 offline_inference_with_prefix.py + - python3 llm_engine_example.py + - python3 llava_example.py + - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors 
/tmp/vllm/facebook/opt-125m/v1/model.tensors + +- label: Inputs Test + #mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + +- label: Kernels Test %N + #mirror_hardwares: [amd] + commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 4 + +- label: Models Test + #mirror_hardwares: [amd] + commands: + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - pytest -v -s models -m \"not vlm\" + +- label: Vision Language Models Test + mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s models -m vlm + +- label: Prefix Caching Test + mirror_hardwares: [amd] + commands: + - pytest -v -s prefix_caching + +- label: Samplers Test + #mirror_hardwares: [amd] + command: pytest -v -s samplers + +- label: LogitsProcessor Test + mirror_hardwares: [amd] + command: pytest -v -s test_logits_processor.py + +- label: Utils Test + command: pytest -v -s test_utils.py + +- label: Worker Test + mirror_hardwares: [amd] + command: pytest -v -s worker + +- label: Speculative decoding tests + #mirror_hardwares: [amd] + commands: + # See https://github.com/vllm-project/vllm/issues/5152 + - export VLLM_ATTENTION_BACKEND=XFORMERS + - pytest -v -s spec_decode + +- label: LoRA Test %N + #mirror_hardwares: [amd] + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + parallelism: 4 + +- label: LoRA Long Context (Distributed) + #mirror_hardwares: [amd] + num_gpus: 4 + # This test runs llama 13B, so it is required to run on 4 GPUs. 
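Several steps in this pipeline export VLLM_WORKER_MULTIPROC_METHOD=spawn; the FIXIT comment below notes that some code initializes CUDA before the test runs, and a forked child cannot reuse an already-initialized CUDA context. A self-contained illustration of that failure mode, assuming a CUDA-capable GPU is available (generic PyTorch behavior, not vLLM code):

```python
# Fork a worker after CUDA is initialized in the parent: the forked child
# typically dies with "Cannot re-initialize CUDA in forked subprocess",
# while a spawned child starts a fresh interpreter and succeeds.
import multiprocessing as mp

import torch


def use_gpu() -> None:
    torch.zeros(1, device="cuda")  # touches the CUDA context in the child


if __name__ == "__main__":
    torch.cuda.init()  # parent initializes CUDA first
    for method in ("fork", "spawn"):
        proc = mp.get_context(method).Process(target=use_gpu)
        proc.start()
        proc.join()
        print(f"{method}: exit code {proc.exitcode}")  # fork is expected to fail
```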
+ commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_long_context.py + +- label: Tensorizer Test + #mirror_hardwares: [amd] + command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + +- label: Metrics Test + mirror_hardwares: [amd] + command: pytest -v -s metrics + +- label: Quantization Test + #mirror_hardwares: [amd] + command: pytest -v -s quantization + +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + +- label: Benchmarks + working_dir: "/vllm-workspace/.buildkite" + mirror_hardwares: [amd] + commands: + - pip install aiohttp + - bash run-benchmarks.sh + +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + +- label: Documentation Build + working_dir: "/vllm-workspace/test_docs/docs" + no_gpu: True + commands: + - pip install -r requirements-docs.txt + - SPHINXOPTS=\"-W\" make html + +- label: Distributed Tests (A100) + gpu: a100 + num_gpus: 4 + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s -x lora/test_mixtral.py From e2e87404f4713e0417e3777226ccf805bc46e613 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 22:17:53 -0700 Subject: [PATCH 06/27] list of list --- .buildkite/test-pipeline.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8574812cb6971..60fc395302e74 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -44,9 +44,12 @@ steps: num_gpus: 2 num_nodes: 2 commands: - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - + - # the following commands are for the first node, with ip 192.168.10.10 + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 
--nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - ray start --head + - # the following commands are for the second node, with ip 192.168.10.11 + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - ray start --address=="192.168.10.10:6379" - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] From 03f296509f2c7fd783ef63f2381ddf8aa41fe657 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 22:19:51 -0700 Subject: [PATCH 07/27] update run node --- .buildkite/run-multi-node-test.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 0d94b2555f166..7423b8cbb128f 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -2,16 +2,17 @@ set -euox pipefail -if [[ $# -lt 3 ]]; then - echo "Please provide the number of nodes and GPU per node." +if [[ $# -lt 4 ]]; then + echo "Please provide the working directory, number of nodes and GPU per node." exit 1 fi -NUM_NODES=$1 -NUM_GPUS=$2 -DOCKER_IMAGE=$3 +WORKING_DIR=$1 +NUM_NODES=$2 +NUM_GPUS=$3 +DOCKER_IMAGE=$4 -shift 3 +shift 4 COMMANDS=("$@") if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then echo "The number of commands must be equal to the number of nodes." @@ -58,9 +59,9 @@ run_nodes() { GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" if [ $node -lt $(($NUM_NODES - 1)) ]; then - docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}" + docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "${COMMANDS[$node]}" + docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } From 04f0e28820bb318ba94e398f93e6feb33bab944a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 22:37:52 -0700 Subject: [PATCH 08/27] add pp test --- .buildkite/test-pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 60fc395302e74..33564d42ec9f9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -39,7 +39,7 @@ steps: - pytest -v -s distributed/test_comm_ops.py - pytest -v -s distributed/test_shm_broadcast.py -- label: 2 Node Tests +- label: 2 Node Tests (4 GPUs in total) working_dir: "/vllm-workspace/tests" num_gpus: 2 num_nodes: 2 @@ -47,6 +47,7 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - ray start --head + - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - ray start --address=="192.168.10.10:6379" From b1ab8eb2796e51dcc53f70d830ff1f496034f82c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 23:07:46 -0700 Subject: [PATCH 09/27] add sleep --- .buildkite/test-pipeline.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 33564d42ec9f9..1cfa69fede12f 100644 --- a/.buildkite/test-pipeline.yaml +++ 
b/.buildkite/test-pipeline.yaml @@ -47,9 +47,11 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - ray start --head + - sleep 20 - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - sleep 10 - ray start --address=="192.168.10.10:6379" - label: Distributed Tests (2 GPUs) From 80c9f6146880f30f229f974ba04f0960c658c11f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 23:16:50 -0700 Subject: [PATCH 10/27] adjust commands --- .buildkite/run-multi-node-test.sh | 8 ++++++-- .buildkite/test-pipeline.yaml | 4 ---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 7423b8cbb128f..3143ff2d59866 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,8 +41,12 @@ start_nodes() { fi done GPU_DEVICES+='"' + if [ $node -eq 0 ]; then + docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "ray start --head && tail -f /dev/null" + else + docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "ray start --address=192.168.10.10:6379 && tail -f /dev/null" + fi # echo "Starting node$node with GPU devices: $GPU_DEVICES" - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null done } @@ -58,7 +62,7 @@ run_nodes() { done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -lt $(($NUM_NODES - 1)) ]; then + if [ $node -ne 0 ]; then docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1cfa69fede12f..ebbc01d03cfcd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,13 +46,9 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - ray start --head - - sleep 20 - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - sleep 10 - - ray start --address=="192.168.10.10:6379" - label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] From 5b1d5af6f11f1eb30fbdf87c079ced2765c90a9c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 9 Jul 2024 23:17:14 -0700 Subject: [PATCH 11/27] add some logging --- tests/distributed/test_same_node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 2d886eb566d5d..07e84d0ad54cd 100644 --- 
a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -10,3 +10,4 @@ expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" assert test_result == expected, f"Expected {expected}, got {test_result}" +print("Same node test passed!") From 35e3ee847964ad1fa2e3652acc303d2ea6a6e80b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 00:16:26 -0700 Subject: [PATCH 12/27] reverse iteration --- .buildkite/run-multi-node-test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 3143ff2d59866..20d9ce32e8d3e 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -51,7 +51,8 @@ start_nodes() { } run_nodes() { - for node in $(seq 0 $(($NUM_NODES-1))); do + # important: iterate in reverse order to start the head node last + for node in $(seq $(($NUM_NODES - 1)) -1 0); do GPU_DEVICES='"device=' for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) From 32e77112bbeed4dd004ec4f3acf29c92df360155 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 09:18:39 -0700 Subject: [PATCH 13/27] add hf cache dir --- .buildkite/run-multi-node-test.sh | 5 +++-- .buildkite/test-pipeline.yaml | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 20d9ce32e8d3e..30f98fa1a1280 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,10 +41,11 @@ start_nodes() { fi done GPU_DEVICES+='"' + docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE -v /root/.cache/huggingface:/root/.cache/huggingface /bin/bash -c "tail -f /dev/null" if [ $node -eq 0 ]; then - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "ray start --head && tail -f /dev/null" + docker exec node$node /bin/bash -c "ray start --head --port=6379 && ray status" else - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "ray start --address=192.168.10.10:6379 && tail -f /dev/null" + docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 && ray status" fi # echo "Starting node$node with GPU devices: $GPU_DEVICES" done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebbc01d03cfcd..bfb3a7a9e1391 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -44,10 +44,10 @@ steps: num_gpus: 2 num_nodes: 2 commands: - - # the following commands are for the first node, with ip 192.168.10.10 + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - label: Distributed Tests (2 GPUs) From 
9ce8c8c4ea18bb56979c46fd2ef1b9f424607b78 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 09:21:31 -0700 Subject: [PATCH 14/27] add hf cache dir --- .buildkite/run-multi-node-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 30f98fa1a1280..2e9370d4be705 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,7 +41,7 @@ start_nodes() { fi done GPU_DEVICES+='"' - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE -v /root/.cache/huggingface:/root/.cache/huggingface /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE -v ~/.cache/huggingface:/root/.cache/huggingface /bin/bash -c "tail -f /dev/null" if [ $node -eq 0 ]; then docker exec node$node /bin/bash -c "ray start --head --port=6379 && ray status" else From 6d2b78886d9c867d8b4e32c4e281b06514f72979 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 09:37:08 -0700 Subject: [PATCH 15/27] fix docker commands --- .buildkite/run-multi-node-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 2e9370d4be705..dfd99d2960aac 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,11 +41,11 @@ start_nodes() { fi done GPU_DEVICES+='"' - docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE -v ~/.cache/huggingface:/root/.cache/huggingface /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" if [ $node -eq 0 ]; then docker exec node$node /bin/bash -c "ray start --head --port=6379 && ray status" else - docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 && ray status" + docker exec node$node /bin/bash -c "ray start --address=192.168.10.10:6379 && ray status" fi # echo "Starting node$node with GPU devices: $GPU_DEVICES" done From f47f86e268f5b432136d058bc446462dcaaba923 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 09:51:32 -0700 Subject: [PATCH 16/27] fix docker commands --- .buildkite/run-multi-node-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index dfd99d2960aac..5a048f84796db 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,9 +41,9 @@ start_nodes() { fi done GPU_DEVICES+='"' - docker run -d --gpus "$GPU_DEVICES" -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" if [ $node -eq 0 ]; then - docker exec node$node /bin/bash -c "ray start --head --port=6379 && ray status" + docker exec node$node /bin/bash -c "ray start --head --port=6379 && sleep 10 && ray status" else docker exec 
node$node /bin/bash -c "ray start --address=192.168.10.10:6379 && ray status" fi From 71ceb35cabdd3865f4d6a46dbb04bb8ee8016e80 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 10 Jul 2024 10:52:05 -0700 Subject: [PATCH 17/27] add block --- .buildkite/run-multi-node-test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 5a048f84796db..53bc37732f2e0 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -41,14 +41,15 @@ start_nodes() { fi done GPU_DEVICES+='"' - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "rm -rf /vllm-workspace/.git && tail -f /dev/null" if [ $node -eq 0 ]; then - docker exec node$node /bin/bash -c "ray start --head --port=6379 && sleep 10 && ray status" + docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" else - docker exec node$node /bin/bash -c "ray start --address=192.168.10.10:6379 && ray status" + docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi # echo "Starting node$node with GPU devices: $GPU_DEVICES" done + docker exec node0 /bin/bash -c "sleep 10 && ray status" } run_nodes() { From 9c3c821e6fb50d814c1ddb587dcc029b0890c875 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 15:54:09 -0700 Subject: [PATCH 18/27] fix actor gpus --- tests/distributed/test_pipeline_parallel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 6072a2dd71800..364e079459e65 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -50,7 +50,10 @@ def server(ray_ctx): args += [ "--enforce-eager", ] - return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE) + # NOTE: ray cannot allocate one actor with multiple GPUs across nodes. + # we just tell ray to use 0 GPUs here, so that the actor can be launched. + # the process itself will launch the server with the correct number of GPUs. 
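To make the NOTE above concrete, here is a minimal standalone sketch (not the test's actual code; the actor class and model name are illustrative) of the pattern it describes: the Ray actor itself reserves zero GPUs, so its placement never has to span nodes, and the process it launches claims the GPUs of whichever node it lands on.

    import subprocess
    import sys

    import ray


    @ray.remote(num_gpus=0)  # the actor reserves no GPUs, so Ray can place it anywhere
    class ServerLauncher:

        def start(self, cli_args):
            # the spawned process, not the actor, ends up owning the GPUs of its node
            proc = subprocess.Popen(
                [sys.executable, "-m", "vllm.entrypoints.openai.api_server", *cli_args])
            return proc.pid


    ray.init()
    launcher = ServerLauncher.remote()
    server_pid = ray.get(launcher.start.remote(["--model", "facebook/opt-125m"]))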
+ return RemoteOpenAIServer(args, num_gpus=0) @pytest.fixture(scope="module") From 196b94bf31bedc0a6eb31c07492bed82f6720aa9 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 16:53:15 -0700 Subject: [PATCH 19/27] remove ray for test api server --- tests/distributed/test_pipeline_parallel.py | 10 +-- tests/utils.py | 85 ++++++++------------- 2 files changed, 33 insertions(+), 62 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 364e079459e65..4dd3de9167758 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -19,16 +19,8 @@ pytestmark = pytest.mark.asyncio - -@pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - @pytest.fixture(scope="module") -def server(ray_ctx): +def server(): args = [ "--model", MODEL_NAME, diff --git a/tests/utils.py b/tests/utils.py index ad4d097b0e8ed..194d1c0a604ab 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,9 +1,10 @@ import os -import subprocess import sys import time import warnings +import weakref from contextlib import contextmanager +from multiprocessing import get_context from pathlib import Path from typing import Any, Dict, List @@ -45,52 +46,17 @@ def _nvml(): """Path to root of the vLLM repository.""" +def api_sever_runner(cli_args: List[str]) -> None: + os.environ["PYTHONUNBUFFERED"] = "1" + sys.argv = ["vllm.entrypoints.openai.api_server"] + cli_args + import runpy + runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__") + + class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - class _RemoteRunner: - - def __init__(self, cli_args: List[str], *, wait_url: str, - wait_timeout: float) -> None: - env = os.environ.copy() - env["PYTHONUNBUFFERED"] = "1" - self.proc = subprocess.Popen( - [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - *cli_args - ], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - - self._wait_for_server(url=wait_url, timeout=wait_timeout) - - def ready(self): - return True - - def _wait_for_server(self, *, url: str, timeout: float): - # run health check - start = time.time() - while True: - try: - if requests.get(url).status_code == 200: - break - except Exception as err: - if self.proc.poll() is not None: - raise RuntimeError( - "Server exited unexpectedly.") from err - - time.sleep(0.5) - if time.time() - start > timeout: - raise RuntimeError( - "Server failed to start in time.") from err - - def __del__(self): - if hasattr(self, "proc"): - self.proc.terminate() - def __init__(self, cli_args: List[str], *, @@ -108,13 +74,29 @@ def __init__(self, self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = ray.remote(num_gpus=num_gpus)( - self._RemoteRunner).remote( - cli_args, - wait_url=self.url_for("health"), - wait_timeout=self.MAX_SERVER_START_WAIT_S) - - self._wait_until_ready() + self.proc = get_context("fork").Process(target=api_sever_runner, + args=(cli_args, )) + self.proc.start() + self._wait_for_server(url=self.url_for("health"), + timeout=self.MAX_SERVER_START_WAIT_S) + + weakref.finalize(self, self.proc.terminate) + + def _wait_for_server(self, *, url: str, timeout: float): + # run health check + start = time.time() + while True: + try: + if requests.get(url).status_code == 200: + break + except Exception as 
err: + if self.proc.exitcode is not None and self.proc.exitcode != 0: + raise RuntimeError("Server exited unexpectedly.") from err + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError( + "Server failed to start in time.") from err @property def url_root(self) -> str: @@ -123,9 +105,6 @@ def url_root(self) -> str: def url_for(self, *parts: str) -> str: return self.url_root + "/" + "/".join(parts) - def _wait_until_ready(self) -> None: - ray.get(self._runner.ready.remote()) - def get_client(self): return openai.OpenAI( base_url=self.url_for("v1"), From ae34bc24d7aba747b4459bc93d38f71cdf5d2967 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 16:58:41 -0700 Subject: [PATCH 20/27] remove ray for launching api server --- tests/async_engine/test_openapi_server_ray.py | 14 ++------------ tests/distributed/test_pipeline_parallel.py | 6 ++---- tests/entrypoints/openai/test_chat.py | 14 ++------------ tests/entrypoints/openai/test_completion.py | 14 ++------------ tests/entrypoints/openai/test_embedding.py | 12 ++---------- tests/entrypoints/openai/test_models.py | 14 ++------------ tests/entrypoints/openai/test_vision.py | 10 +--------- tests/tensorizer_loader/test_tensorizer.py | 7 +------ 8 files changed, 14 insertions(+), 77 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index cc05d79e56874..26d4e6a359644 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -1,24 +1,14 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. -import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(ray_ctx): +def server(): return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 4dd3de9167758..1014174f5d2bf 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -2,11 +2,8 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. -import ray -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # downloading lora to test lora requests @@ -19,6 +16,7 @@ pytestmark = pytest.mark.asyncio + @pytest.fixture(scope="module") def server(): args = [ diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 3e80214f24dc5..f901031778b59 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -6,15 +6,12 @@ import jsonschema import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. 
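The _wait_for_server logic above boils down to a generic readiness probe. Below is a self-contained sketch of that pattern (assumed function name, not the repository's helper itself): poll an HTTP health endpoint until it answers 200, fail fast if the child process has already exited, and give up after a timeout.

    import subprocess
    import time

    import requests


    def wait_for_http_server(proc: subprocess.Popen, url: str, timeout: float) -> None:
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    return  # server is healthy
            except Exception as err:
                if proc.poll() is not None:
                    # the child died before ever answering the health check
                    raise RuntimeError("Server exited unexpectedly.") from err
                time.sleep(0.5)
                if time.time() - start > timeout:
                    raise RuntimeError("Server failed to start in time.") from err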
-import ray import torch # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -76,14 +73,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): +def server(zephyr_lora_files): return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 52a848b7831d5..40195ae3009f8 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -6,9 +6,6 @@ import jsonschema import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. -import ray import requests # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -16,7 +13,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -78,14 +75,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): +def server(zephyr_lora_files): return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f8aa1c9143a3b..8865d32d417ad 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -3,22 +3,14 @@ import numpy as np import openai import pytest -import ray -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def embedding_server(ray_ctx): +def embedding_server(): return RemoteOpenAIServer([ "--model", EMBEDDING_MODEL_NAME, diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 914ef6e19e109..da147756ac61f 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,12 +1,9 @@ import openai # use the official client for correctness check import pytest -# using Ray for overall ease of process management, parallel requests, -# and debugging. 
-import ray # downloading lora to test lora requests from huggingface_hub import snapshot_download -from ...utils import VLLM_PATH, RemoteOpenAIServer +from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -21,14 +18,7 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): +def server(zephyr_lora_files): return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index b869717608d0f..ed47bd7094b60 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -3,7 +3,6 @@ import openai import pytest import pytest_asyncio -import ray from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 @@ -23,14 +22,7 @@ @pytest.fixture(scope="module") -def ray_ctx(): - ray.init(runtime_env={"working_dir": VLLM_PATH}) - yield - ray.shutdown() - - -@pytest.fixture(scope="module") -def server(ray_ctx): +def server(): return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b2ebcc15cd0fc..3e4ec0e116bc1 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -6,7 +6,6 @@ import openai import pytest -import ray import torch from tensorizer import EncryptionParams @@ -22,7 +21,7 @@ tensorize_vllm_model) from ..conftest import VllmRunner, cleanup -from ..utils import VLLM_PATH, RemoteOpenAIServer +from ..utils import RemoteOpenAIServer # yapf conflicts with isort for this docstring @@ -220,8 +219,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): json.dumps(model_loader_extra_config), ] - ray.init(runtime_env={"working_dir": VLLM_PATH}) - server = RemoteOpenAIServer(openai_args) print("Server ready.") @@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, base_model.model.llm_engine.model_executor.shutdown() del base_model cleanup() - ray.shutdown() # load model with two shards and serialize with encryption model_path = str(tmp_path / (model_ref + "-%02d.tensors")) @@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, assert os.path.isfile(model_path % 0), "Serialization subprocess failed" assert os.path.isfile(model_path % 1), "Serialization subprocess failed" cleanup() - ray.shutdown() loaded_vllm_model = vllm_runner( model_ref, From 0aaf26868a303c6a5bc19278af95685c98213cf5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 19:48:34 -0700 Subject: [PATCH 21/27] use spawn --- .buildkite/test-pipeline.yaml | 5 ++++- tests/utils.py | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc747508daa74..119f4e1c39bf6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -193,7 +193,10 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + commands: + - apt-get install curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader - label: Metrics Test mirror_hardwares: [amd] diff --git a/tests/utils.py 
b/tests/utils.py index 194d1c0a604ab..e3239f0f8c15c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,10 +1,10 @@ import os +import subprocess import sys import time import warnings import weakref from contextlib import contextmanager -from multiprocessing import get_context from pathlib import Path from typing import Any, Dict, List @@ -74,9 +74,16 @@ def __init__(self, self.host = str(args.host or 'localhost') self.port = int(args.port) - self.proc = get_context("fork").Process(target=api_sever_runner, - args=(cli_args, )) - self.proc.start() + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' + self.proc = subprocess.Popen( + [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] + + cli_args, + env=env, + stdout=sys.stdout, + stderr=sys.stderr) self._wait_for_server(url=self.url_for("health"), timeout=self.MAX_SERVER_START_WAIT_S) @@ -90,7 +97,8 @@ def _wait_for_server(self, *, url: str, timeout: float): if requests.get(url).status_code == 200: break except Exception as err: - if self.proc.exitcode is not None and self.proc.exitcode != 0: + result = self.proc.poll() + if result is not None and result != 0: raise RuntimeError("Server exited unexpectedly.") from err time.sleep(0.5) From 3afa5a9e5ca975630a8af66ceca280bf33090fd6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 21:03:15 -0700 Subject: [PATCH 22/27] add comments --- .buildkite/run-multi-node-test.sh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 53bc37732f2e0..7ac4dcc4c786d 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -3,7 +3,7 @@ set -euox pipefail if [[ $# -lt 4 ]]; then - echo "Please provide the working directory, number of nodes and GPU per node." + echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" exit 1 fi @@ -41,19 +41,39 @@ start_nodes() { fi done GPU_DEVICES+='"' - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "rm -rf /vllm-workspace/.git && tail -f /dev/null" + + # start the container in detached mode + # things to note: + # 1. --shm-size=10.24gb is required. don't use --ipc=host + # 2. pass HF_TOKEN to the container + # 3. map the huggingface cache directory to the container + # 3. 
assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: + # starting from 192.168.10.11) + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + + # organize containers into a ray cluster if [ $node -eq 0 ]; then + # start the ray head node docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + # wait for the head node to be ready + sleep 10 else + # start the ray worker nodes, and connect them to the head node docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi - # echo "Starting node$node with GPU devices: $GPU_DEVICES" done - docker exec node0 /bin/bash -c "sleep 10 && ray status" + + # wait for the cluster to be ready + sleep 10 + + # print the cluster status + docker exec node0 /bin/bash -c "ray status" } run_nodes() { # important: iterate in reverse order to start the head node last + # we start the worker nodes first, in detached mode, and then start the head node + # in the foreground, so that the output of the head node is visible in the buildkite logs for node in $(seq $(($NUM_NODES - 1)) -1 0); do GPU_DEVICES='"device=' for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do From 00b2e7b019a722ca02d9958d73ef3c302585d465 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 21:07:10 -0700 Subject: [PATCH 23/27] remove num_gpus --- tests/distributed/test_pipeline_parallel.py | 5 +---- tests/utils.py | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 1014174f5d2bf..7431d4b67f2ec 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -40,10 +40,7 @@ def server(): args += [ "--enforce-eager", ] - # NOTE: ray cannot allocate one actor with multiple GPUs across nodes. - # we just tell ray to use 0 GPUs here, so that the actor can be launched. - # the process itself will launch the server with the correct number of GPUs. 
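For readers who prefer it spelled out outside the shell script, here is a rough Python transcription of the container bring-up commented above (hypothetical helper and placeholder image name; the real logic lives in .buildkite/run-multi-node-test.sh): one container per node on a user-defined bridge network with fixed IPs, the first container started as the Ray head and the rest joining it.

    import subprocess
    from pathlib import Path

    DOCKER_IMAGE = "vllm/vllm-openai:latest"  # placeholder; CI passes its own image
    HF_CACHE = f"{Path.home()}/.cache/huggingface"


    def start_node(node: int, gpu_devices: str) -> None:
        ip = f"192.168.10.{10 + node}"
        subprocess.run([
            "docker", "run", "-d", "--rm",
            "--gpus", gpu_devices,
            "--shm-size=10.24gb",   # required; --ipc=host is deliberately not used
            "-e", "HF_TOKEN",       # forward the Hugging Face token into the container
            "-v", f"{HF_CACHE}:/root/.cache/huggingface",
            "--name", f"node{node}",
            "--network", "docker-net", "--ip", ip,
            DOCKER_IMAGE, "/bin/bash", "-c", "tail -f /dev/null",
        ], check=True)
        ray_cmd = ("ray start --head --port=6379 --block" if node == 0 else
                   "ray start --address=192.168.10.10:6379 --block")
        subprocess.run(["docker", "exec", "-d", f"node{node}", "/bin/bash", "-c", ray_cmd],
                       check=True)

After all nodes are started this way, the script sleeps briefly and then runs ray status inside node0 to confirm the cluster has formed.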
- return RemoteOpenAIServer(args, num_gpus=0) + return RemoteOpenAIServer(args) @pytest.fixture(scope="module") diff --git a/tests/utils.py b/tests/utils.py index e3239f0f8c15c..7821b070ca96b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -57,11 +57,7 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds - def __init__(self, - cli_args: List[str], - *, - auto_port: bool = True, - num_gpus: int = 1) -> None: + def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: if auto_port: if "-p" in cli_args or "--port" in cli_args: raise ValueError("You have manually specified the port" From 88ff95caa1fb4aad7e34c9bf5de2f4ea37824578 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 11 Jul 2024 21:09:27 -0700 Subject: [PATCH 24/27] remove useless code --- tests/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 7821b070ca96b..e2b7c7ca97a95 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -46,13 +46,6 @@ def _nvml(): """Path to root of the vLLM repository.""" -def api_sever_runner(cli_args: List[str]) -> None: - os.environ["PYTHONUNBUFFERED"] = "1" - sys.argv = ["vllm.entrypoints.openai.api_server"] + cli_args - import runpy - runpy.run_module("vllm.entrypoints.openai.api_server", run_name="__main__") - - class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds From cd43c312a30b816906d718f00a0da6c0547770a1 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Sat, 13 Jul 2024 00:04:01 +0000 Subject: [PATCH 25/27] Multinode hang fix Signed-off-by: Muralidhar Andoorveedu --- vllm/executor/ray_gpu_executor.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 6e13264aba233..520146caaf508 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -224,16 +224,13 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # broadcasted to. 
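A toy illustration of the partitioning that the change below ends up with (hypothetical values, not the executor code itself): after skipping rank 0, every remaining rank whose value is divisible by the tensor-parallel size becomes a per-pipeline-stage TP driver, and everything else is a plain non-driver worker.

    tensor_parallel_size = 2
    worker_ranks = [0, 1, 2, 3]      # rank 0 is the driver process itself
    workers = ["w1", "w2", "w3"]     # workers[idx] corresponds to worker_ranks[1:][idx]

    tp_driver_workers, non_driver_workers = [], []
    for idx, rank in enumerate(worker_ranks[1:]):
        if rank % tensor_parallel_size == 0:
            tp_driver_workers.append(workers[idx])
        else:
            non_driver_workers.append(workers[idx])

    assert tp_driver_workers == ["w2"]          # rank 2 drives the second pipeline stage
    assert non_driver_workers == ["w1", "w3"]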
self.non_driver_workers: List[RayWorkerWrapper] = [] - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - for tp_rank in range(self.parallel_config.tensor_parallel_size): - rank = (pp_rank * - self.parallel_config.tensor_parallel_size) + tp_rank - if rank == 0: - pass - elif rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(self.workers[rank - 1]) - else: - self.non_driver_workers.append(self.workers[rank - 1]) + for idx, rank in enumerate(worker_ranks[1:]): + if rank == 0: + pass + elif rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(self.workers[idx]) + else: + self.non_driver_workers.append(self.workers[idx]) def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] From 9f71fb294ac5157159b95d0e7a88cbd50e7046f5 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Sat, 13 Jul 2024 00:33:02 +0000 Subject: [PATCH 26/27] Get rid of first if condition Signed-off-by: Muralidhar Andoorveedu --- vllm/executor/ray_gpu_executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 520146caaf508..388f934ef75a6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -225,9 +225,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self.non_driver_workers: List[RayWorkerWrapper] = [] for idx, rank in enumerate(worker_ranks[1:]): - if rank == 0: - pass - elif rank % self.parallel_config.tensor_parallel_size == 0: + # We need to skip the driver worker, which we + # do by skipping worker_ranks[0] which is always 0. + if rank % self.parallel_config.tensor_parallel_size == 0: self.tp_driver_workers.append(self.workers[idx]) else: self.non_driver_workers.append(self.workers[idx]) From 0e037ccc57af780424c97b608ddaf25a98ad7bf1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 12 Jul 2024 19:55:35 -0700 Subject: [PATCH 27/27] use context manager for remote server --- tests/async_engine/test_openapi_server_ray.py | 23 +++++----- tests/distributed/test_pipeline_parallel.py | 3 +- tests/entrypoints/openai/test_chat.py | 43 ++++++++++--------- tests/entrypoints/openai/test_completion.py | 43 ++++++++++--------- tests/entrypoints/openai/test_embedding.py | 23 +++++----- tests/entrypoints/openai/test_models.py | 43 ++++++++++--------- tests/entrypoints/openai/test_vision.py | 23 +++++----- tests/tensorizer_loader/test_tensorizer.py | 30 ++++++------- tests/utils.py | 7 ++- 9 files changed, 124 insertions(+), 114 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 26d4e6a359644..575f8f19b8ebe 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -9,17 +9,18 @@ @pytest.fixture(scope="module") def server(): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--max-model-len", - "2048", - "--enforce-eager", - "--engine-use-ray" - ]) + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--max-model-len", + "2048", + "--enforce-eager", + "--engine-use-ray" + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/distributed/test_pipeline_parallel.py 
b/tests/distributed/test_pipeline_parallel.py index 7431d4b67f2ec..2d9f63795189d 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -40,7 +40,8 @@ def server(): args += [ "--enforce-eager", ] - return RemoteOpenAIServer(args) + with RemoteOpenAIServer(args) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index e416fead81c49..d370c63c0c7ba 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -27,27 +27,28 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 73df487a10e33..6e5fdebe786e1 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -29,27 +29,28 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 8865d32d417ad..4a32aadc8c3ae 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,17 +11,18 @@ @pytest.fixture(scope="module") def embedding_server(): - return RemoteOpenAIServer([ - "--model", - EMBEDDING_MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--enforce-eager", - "--max-model-len", - "8192", - "--enforce-eager", - ]) + with RemoteOpenAIServer([ + "--model", + 
EMBEDDING_MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--enforce-eager", + "--max-model-len", + "8192", + "--enforce-eager", + ]) as remote_server: + yield remote_server @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index da147756ac61f..bf63f9a813f2c 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -19,27 +19,28 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") def server(zephyr_lora_files): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--enforce-eager", - # lora config below - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - "--max-num-seqs", - "128", - ]) + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + "--max-num-seqs", + "128", + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index ed47bd7094b60..563b68566bd2c 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,17 +23,18 @@ @pytest.fixture(scope="module") def server(): - return RemoteOpenAIServer([ - "--model", - MODEL_NAME, - "--dtype", - "bfloat16", - "--max-model-len", - "4096", - "--enforce-eager", - "--chat-template", - str(LLAVA_CHAT_TEMPLATE), - ]) + with RemoteOpenAIServer([ + "--model", + MODEL_NAME, + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + "--chat-template", + str(LLAVA_CHAT_TEMPLATE), + ]) as remote_server: + yield remote_server @pytest.fixture(scope="module") diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 3e4ec0e116bc1..a43f9132585b5 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -219,21 +219,21 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): json.dumps(model_loader_extra_config), ] - server = RemoteOpenAIServer(openai_args) - print("Server ready.") - - client = server.get_client() - completion = client.completions.create(model=model_ref, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert len(completion.choices) == 1 - assert len(completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) + with RemoteOpenAIServer(openai_args) as server: + print("Server ready.") + + client = server.get_client() + completion = client.completions.create(model=model_ref, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) >= 5 + assert 
completion.choices[0].finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) def test_raise_value_error_on_invalid_load_format(vllm_runner): diff --git a/tests/utils.py b/tests/utils.py index e2b7c7ca97a95..50f723b0b18a3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,7 +3,6 @@ import sys import time import warnings -import weakref from contextlib import contextmanager from pathlib import Path from typing import Any, Dict, List @@ -76,7 +75,11 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: self._wait_for_server(url=self.url_for("health"), timeout=self.MAX_SERVER_START_WAIT_S) - weakref.finalize(self, self.proc.terminate) + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.proc.terminate() def _wait_for_server(self, *, url: str, timeout: float): # run health check
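A short usage sketch of the context-manager interface introduced in this last patch (model name and CLI flags are illustrative, and it assumes the tests package is importable): leaving the with block triggers __exit__, which terminates the server subprocess, and the reworked pytest fixtures get the same behaviour by yielding inside the with.

    from tests.utils import RemoteOpenAIServer

    with RemoteOpenAIServer([
            "--model", "facebook/opt-125m",
            "--dtype", "float16",
            "--max-model-len", "2048",
            "--enforce-eager",
    ]) as remote_server:
        client = remote_server.get_client()
        completion = client.completions.create(model="facebook/opt-125m",
                                               prompt="Hello, my name is",
                                               max_tokens=5,
                                               temperature=0.0)
        print(completion.choices[0].text)
    # __exit__ has run here, so the API server process has been terminated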