Merge branch 'main' into fix-forked-error
rkooo567 committed Mar 28, 2024
2 parents a1bd55d + 4716a32 commit b415b96
Showing 34 changed files with 1,520 additions and 108 deletions.
4 changes: 3 additions & 1 deletion .buildkite/run-benchmarks.sh
@@ -49,7 +49,9 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
10 changes: 10 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -36,6 +36,16 @@ steps:
- label: Entrypoints Test
command: pytest -v -s entrypoints

- label: Examples Test
working_dir: "/vllm-workspace/examples"
commands:
# install aws cli for llava_example.py
- pip install awscli
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py

- label: Kernels Test %N
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
1 change: 1 addition & 0 deletions README.md
@@ -89,6 +89,7 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
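The Qwen2MoE entry added above can be exercised through vLLM's standard offline entry point. A minimal sketch, not part of this commit; the prompt and sampling settings are illustrative:

```python
# Minimal sketch: load one of the newly listed Qwen2MoE checkpoints.
# The prompt and sampling parameters are illustrative, not from this commit.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen1.5-MoE-A2.7B-Chat")
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["Briefly explain what a mixture-of-experts model is."],
                       sampling_params)
print(outputs[0].outputs[0].text)
```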
14 changes: 13 additions & 1 deletion benchmarks/benchmark_latency.py
@@ -26,7 +26,9 @@ def main(args: argparse.Namespace):
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
download_dir=args.download_dir)
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size)

sampling_params = SamplingParams(
n=args.n,
@@ -145,6 +147,16 @@ def run_to_completion(profile_dir: Optional[str] = None):
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
type=bool,
default=False,
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
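One caveat with the new `--enable-chunked-prefill` option: it is declared with `type=bool`, and argparse converts any non-empty string to `True`, so `--enable-chunked-prefill False` still enables it. The standalone sketch below only illustrates standard argparse behavior (it is not a change to the benchmark script); `action='store_true'` is the usual way to avoid the surprise.

```python
# Standalone illustration of argparse's type=bool behavior (not repository code):
# bool("False") is True in Python, so any value given on the command line
# enables the option. action='store_true' sidesteps this.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--enable-chunked-prefill', type=bool, default=False)
parser.add_argument('--use-flag', action='store_true')

args = parser.parse_args(['--enable-chunked-prefill', 'False'])
print(args.enable_chunked_prefill)  # True, because bool("False") is True

args = parser.parse_args([])
print(args.use_flag)  # False unless --use-flag is given
```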
4 changes: 4 additions & 0 deletions docs/source/models/supported_models.rst
@@ -119,6 +119,10 @@ Alongside each architecture, we include some popular models that use it.
- Qwen2
- :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
- ✅︎
* - :code:`Qwen2MoeForCausalLM`
- Qwen2MoE
- :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-
* - :code:`StableLmForCausalLM`
- StableLM
- :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
12 changes: 9 additions & 3 deletions examples/llava_example.py
@@ -78,7 +78,13 @@ def main(args):
# Make sure the local directory exists or create it
os.makedirs(local_directory, exist_ok=True)

# Use AWS CLI to sync the directory
subprocess.check_call(
["aws", "s3", "sync", s3_bucket_path, local_directory])
# Use AWS CLI to sync the directory, assume anonymous access
subprocess.check_call([
"aws",
"s3",
"sync",
s3_bucket_path,
local_directory,
"--no-sign-request",
])
main(args)
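`--no-sign-request` tells the AWS CLI to access the bucket anonymously. For reference, the same anonymous access can be expressed directly in Python with boto3; the sketch below uses placeholder bucket and key names and requires `pip install boto3` (it is not code from this repository):

```python
# Sketch of anonymous S3 access with boto3, analogous in spirit to
# `aws s3 sync ... --no-sign-request`. Bucket and key names are placeholders.
import boto3
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
s3.download_file("example-public-bucket", "llava/images/example.jpg",
                 "example.jpg")
```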
2 changes: 1 addition & 1 deletion examples/offline_inference_with_prefix.py
@@ -22,7 +22,7 @@
sampling_params = SamplingParams(temperature=0.0)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

generating_prompts = [prefix + prompt for prompt in prompts]

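With `enable_prefix_caching=True`, KV-cache blocks computed for a shared prompt prefix can be reused by later requests that start with the same tokens. A self-contained sketch in the spirit of this example (the prefix and prompts are illustrative):

```python
# Minimal sketch of automatic prefix caching: every prompt shares the same
# prefix, so its cached KV blocks can be reused across requests. Illustrative only.
from vllm import LLM, SamplingParams

prefix = ("You are a helpful assistant. Answer the question in one short "
          "sentence.\n\nQuestion: ")
prompts = ["What is the capital of France?", "What is 2 + 2?"]

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
sampling_params = SamplingParams(temperature=0.0)

outputs = llm.generate([prefix + p for p in prompts], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```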
4 changes: 4 additions & 0 deletions tests/conftest.py
@@ -281,6 +281,8 @@ def __init__(
dtype: str = "half",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16,
enable_chunked_prefill: bool = False,
**kwargs,
) -> None:
self.model = LLM(
@@ -292,6 +294,8 @@ def __init__(
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
**kwargs,
)

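The extended test fixture simply forwards the two new knobs to the `LLM` constructor. A sketch of the equivalent direct construction with illustrative values; it assumes `max_num_batched_tokens` is accepted as an engine argument through `LLM(**kwargs)` and acts as the per-step token budget against which prefills are chunked:

```python
# Sketch (illustrative, not repository code): enabling chunked prefill on the
# LLM entry point, mirroring the options the test fixture now forwards.
# Assumes max_num_batched_tokens is accepted via LLM(**kwargs) engine args.
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",
    block_size=16,
    enable_chunked_prefill=True,
    max_num_batched_tokens=256,  # prefill chunks are sized against this budget
)

outputs = llm.generate(["Write one sentence about the ocean."],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```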
22 changes: 13 additions & 9 deletions tests/core/test_scheduler.py
@@ -10,6 +10,10 @@
from .utils import create_dummy_prompt


def get_sequence_groups(scheduler_output):
return [s.seq_group for s in scheduler_output.scheduled_seq_groups]


def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(100, 64, 1)
@@ -57,9 +61,9 @@ def test_scheduler_schedule_simple():
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
scheduler = Scheduler(scheduler_config, cache_config, None)
running: List[SequenceGroup] = []

# Add seq groups to scheduler.
running: List[SequenceGroup] = []
for i in range(num_seq_group):
_, seq_group = create_dummy_prompt(str(i), prompt_length=block_size)
scheduler.add_seq_group(seq_group)
@@ -68,15 +72,15 @@ def test_scheduler_schedule_simple():
# Schedule seq groups prompts.
num_tokens = block_size * num_seq_group
seq_group_meta, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set(running)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_tokens
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
assert len(seq_group_meta) == num_seq_group

# Schedule seq groups generation.
seq_group_meta, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set(running)
assert set(get_sequence_groups(out)) == set(running)
assert out.num_batched_tokens == num_seq_group
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
@@ -100,7 +104,7 @@ def test_scheduler_schedule_preempt_abort():

# Schedule seq groups prompts.
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_a, seq_group_b]
assert get_sequence_groups(out) == [seq_group_a, seq_group_b]
assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
@@ -115,7 +119,7 @@ def test_scheduler_schedule_preempt_abort():

# Schedule seq groups generation and preempt seq group b.
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_a]
assert get_sequence_groups(out) == [seq_group_a]
assert out.num_batched_tokens == 1
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
@@ -125,7 +129,7 @@ def test_scheduler_schedule_preempt_abort():
# Abort seq group a. Re-schedule seq group b prompt with recomputation.
scheduler.abort_seq_group("1")
seq_group_meta, out = scheduler.schedule()
assert out.scheduled_seq_groups == [seq_group_b]
assert get_sequence_groups(out) == [seq_group_b]
assert out.num_batched_tokens == 5 # 4 prompt + 1 generation.
assert (not out.blocks_to_copy and not out.blocks_to_swap_in
and not out.blocks_to_swap_out)
@@ -155,11 +159,11 @@ def test_scheduler_max_seqs():

# Schedule seq groups prompts.
_, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]])
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])

# Schedule seq groups generation.
_, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]])
assert set(get_sequence_groups(out)) == set([all_seq_groups[0]])

# Append 2 more seq group
scheduler.add_seq_group(all_seq_groups[1])
@@ -169,7 +173,7 @@ def test_scheduler_max_seqs():
# Only 1 seq group should be scheduled since max_seq_group is 2
# and one is prompting.
_, out = scheduler.schedule()
assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]])
assert set(get_sequence_groups(out)) == set([all_seq_groups[1]])


def test_scheduler_delay_factor():
6 changes: 3 additions & 3 deletions tests/distributed/test_comm_ops.py
@@ -24,7 +24,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
distributed_init_port)
num_elements = 8
all_tensors = [
@@ -46,7 +46,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
@@ -74,7 +74,7 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, tensor_parallel_size, rank,
init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
distributed_init_port)
test_dict = {
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
4 changes: 2 additions & 2 deletions tests/distributed/test_custom_all_reduce.py
@@ -23,7 +23,7 @@ def graph_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
init_test_distributed_environment(1, world_size, rank, rank,
distributed_init_port)

custom_ar.init_custom_ar()
@@ -58,7 +58,7 @@ def eager_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
init_test_distributed_environment(1, world_size, rank, rank,
distributed_init_port)

sz = 1024
24 changes: 23 additions & 1 deletion tests/test_sequence.py
@@ -1,6 +1,7 @@
import pytest

from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput
from vllm.sequence import (SamplerOutput, SequenceData, SequenceGroupOutput,
SequenceOutput)


@pytest.fixture
@@ -48,3 +49,24 @@ def test_sampler_output_eq(sample_outputs):
sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1])
assert sampler_output1 == sampler_output2
assert sampler_output1 != sampler_output3


def test_sequence_data_prefill():
seq_data = SequenceData(prompt_token_ids=[1, 2, 3, 4])
assert seq_data.get_num_uncomputed_tokens() == 4
assert seq_data.get_num_computed_tokens() == 0
# advance by 2
seq_data.update_num_computed_tokens(2)
assert seq_data.get_num_uncomputed_tokens() == 2
assert seq_data.get_num_computed_tokens() == 2

# advance by 1
seq_data.update_num_computed_tokens(1)
assert seq_data.get_num_uncomputed_tokens() == 1
assert seq_data.get_num_computed_tokens() == 3

# append tokens and reset, simulating recompute
seq_data.append_token_id(1, logprob=0.0)
seq_data.reset_num_computed_tokens()
assert seq_data.get_num_uncomputed_tokens() == 5
assert seq_data.get_num_computed_tokens() == 0
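The new test exercises the bookkeeping that chunked prefill builds on: a prompt is marked as computed in increments rather than all at once. A compact sketch of that consumption loop, using only the `SequenceData` methods shown above (the chunk size is arbitrary):

```python
# Sketch: consume a prompt in fixed-size chunks, as chunked prefill would,
# using the SequenceData bookkeeping exercised in the test above.
from vllm.sequence import SequenceData

chunk_size = 3  # arbitrary, for illustration
seq_data = SequenceData(prompt_token_ids=list(range(10)))

while seq_data.get_num_uncomputed_tokens() > 0:
    chunk = min(chunk_size, seq_data.get_num_uncomputed_tokens())
    seq_data.update_num_computed_tokens(chunk)
    print(seq_data.get_num_computed_tokens(), "computed,",
          seq_data.get_num_uncomputed_tokens(), "remaining")
```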
37 changes: 20 additions & 17 deletions tests/worker/test_model_runner.py
@@ -18,15 +18,16 @@ def test_prepare_prompt(batch_size):
# make sure all tokens fit into one block
prompt_len = i % (model_runner.block_size - 1) + 1
prompt_lens.append(prompt_len)
seq_data = list(range(prompt_len))
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData(seq_data)},
sampling_params=SamplingParams(temperature=0),
block_tables=block_tables,
))
seq_data = SequenceData(list(range(prompt_len)))
seq_group_metadata = SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: seq_data},
sampling_params=SamplingParams(temperature=0),
block_tables=block_tables,
)
assert seq_group_metadata.token_chunk_size == seq_data.get_len()
seq_group_metadata_list.append(seq_group_metadata)

expected_selected_token_indices = []
selected_token_start_idx = 0
@@ -131,14 +132,16 @@ def test_prepare_decode_cuda_graph(batch_size):
prompt_len = i % (model_runner.block_size - 1) + 1
prompt_lens.append(prompt_len)
seq_data = list(range(prompt_len))
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=False,
seq_data={0: SequenceData(seq_data)},
sampling_params=SamplingParams(temperature=0),
block_tables={0: [1]},
))
seq_data = SequenceData(seq_data)
seq_group_metadata = SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=False,
seq_data={0: seq_data},
sampling_params=SamplingParams(temperature=0),
block_tables={0: [1]},
)
assert seq_group_metadata.token_chunk_size == 1
seq_group_metadata_list.append(seq_group_metadata)

input_tokens, input_positions, attn_metadata, _, _, _ = (
model_runner._prepare_decode(seq_group_metadata_list))
4 changes: 3 additions & 1 deletion vllm/attention/selector.py
@@ -41,6 +41,8 @@ def _can_use_flash_attn(dtype: torch.dtype) -> bool:
try:
import flash_attn # noqa: F401
except ImportError:
logger.info("flash_attn is not found.")
logger.info(
"Cannot use FlashAttention because the package is not found. "
"Please install it for better performance.")
return False
return True
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -533,6 +533,8 @@ class SchedulerConfig:
delay_factor: Apply a delay (of delay factor multiplied by previous
prompt latency) before scheduling next prompt.
use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not.
enable_chunked_prefill: If True, prefill requests can be chunked based
on the remaining max_num_batched_tokens.
"""

def __init__(
@@ -542,6 +544,7 @@ def __init__(
max_model_len: int,
use_v2_block_manager: bool = False,
delay_factor: float = 0.0,
enable_chunked_prefill: bool = False,
) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
@@ -553,6 +556,7 @@ def __init__(
self.max_model_len = max_model_len
self.delay_factor = delay_factor
self.use_v2_block_manager = use_v2_block_manager
self.chunked_prefill_enabled = enable_chunked_prefill
self._verify_args()

def _verify_args(self) -> None:
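At the config level, the new flag is stored as `chunked_prefill_enabled`. A small sketch with illustrative values, following the positional order (`max_num_batched_tokens`, `max_num_seqs`, `max_model_len`) used by the scheduler tests above:

```python
# Sketch (illustrative values): SchedulerConfig keeps the new flag as
# chunked_prefill_enabled, as set in __init__ above.
from vllm.config import SchedulerConfig

scheduler_config = SchedulerConfig(
    max_num_batched_tokens=512,
    max_num_seqs=16,
    max_model_len=512,
    enable_chunked_prefill=True,
)
assert scheduler_config.chunked_prefill_enabled
assert scheduler_config.max_num_batched_tokens == 512
```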
3 changes: 1 addition & 2 deletions vllm/core/block_manager_v1.py
@@ -230,13 +230,12 @@ def __init__(
self.watermark_blocks = int(watermark * num_gpu_blocks)

if self.enable_caching:
logger.info("enable automatic prefix caching")
logger.info("Automatic prefix caching is enabled.")
self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
num_gpu_blocks)
self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
num_cpu_blocks)
else:
logger.info("disable automatic prefix caching")
self.gpu_allocator = UncachedBlockAllocator(
Device.GPU, block_size, num_gpu_blocks)
self.cpu_allocator = UncachedBlockAllocator(
(Diffs for the remaining changed files are not shown here.)
