[Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414)
1 parent f23871e · commit 2ce5d66 · 11 changed files with 388 additions and 59 deletions.
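For context, here is a minimal sketch of how the new option fits into user code, assuming the vllm.LLM entry point forwards the same engine kwargs the tests below use (the exact plumbing may differ by vLLM version; treat this as illustrative, not an official recipe):

from vllm import LLM, SamplingParams

# Illustrative only: target model sharded over 2 GPUs, draft model at TP=1.
# Kwarg names mirror the new tests; speculative_draft_tensor_parallel_size
# is the option added by this commit.
llm = LLM(
    model="JackFram/llama-68m",                # target model
    tensor_parallel_size=2,                    # target-model TP degree
    speculative_model="JackFram/llama-68m",    # draft model
    speculative_draft_tensor_parallel_size=1,  # draft TP < target TP
    num_speculative_tokens=5,
    use_v2_block_manager=True,                 # required for spec decode
)
outputs = llm.generate(["San Francisco is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)

Keeping the draft model at TP=1 avoids sharding a tiny drafter across GPUs, where communication overhead can outweigh any gain from splitting so few parameters.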
@@ -0,0 +1,111 @@
"""Tests which cover integration of the speculative decoding framework with | ||
tensor parallelism. | ||
""" | ||
|
||
import pytest | ||
import torch | ||
|
||
from vllm.utils import is_hip | ||
|
||
from .conftest import run_greedy_equality_correctness_test | ||
|
||
|
||
@pytest.mark.skipif(torch.cuda.device_count() < 2, | ||
reason="Need at least 2 GPUs to run the test.") | ||
@pytest.mark.parametrize( | ||
"common_llm_kwargs", | ||
[{ | ||
"model": "JackFram/llama-68m", | ||
# Skip cuda graph recording for fast test. | ||
"enforce_eager": True, | ||
# Required for spec decode. | ||
"use_v2_block_manager": True, | ||
"tensor_parallel_size": 2, | ||
# Use AsyncLLM engine, so that the engine runs in its own process. | ||
# Otherwise, since vLLM does not follow true SPMD, the test runner | ||
# process will have both the engine and the rank0 worker. NCCL is not | ||
# cleaned up properly, and its server host thread leaks, causing the | ||
# second run of the test to fail with internal NCCL error. | ||
"use_async": True, | ||
}]) | ||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) | ||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) | ||
@pytest.mark.parametrize("test_llm_kwargs", [ | ||
{ | ||
"speculative_model": "JackFram/llama-68m", | ||
"num_speculative_tokens": 3, | ||
}, | ||
{ | ||
"speculative_model": "[ngram]", | ||
"num_speculative_tokens": 5, | ||
"ngram_prompt_lookup_max": 3, | ||
}, | ||
]) | ||
@pytest.mark.parametrize("batch_size", [2]) | ||
@pytest.mark.parametrize( | ||
"output_len", | ||
[ | ||
# Use smaller output len for fast test. | ||
32, | ||
]) | ||
@pytest.mark.parametrize("seed", [1]) | ||
def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator, | ||
batch_size: int, output_len: int): | ||
"""Verify greedy equality when tensor parallelism is used. | ||
""" | ||
if is_hip(): | ||
pytest.skip("hip is not well-supported yet") | ||
run_greedy_equality_correctness_test(baseline_llm_generator, | ||
test_llm_generator, | ||
batch_size, | ||
max_output_len=output_len, | ||
force_output_len=True) | ||
|
||
|
||
@pytest.mark.skipif(torch.cuda.device_count() < 2, | ||
reason="Need at least 2 GPUs to run the test.") | ||
@pytest.mark.parametrize( | ||
"common_llm_kwargs", | ||
[{ | ||
# Use a small model for a fast test. | ||
# Note this is repeated in the test body; to initialize a tokenizer. | ||
"model": "JackFram/llama-68m", | ||
# Skip cuda graph recording for fast test. | ||
"enforce_eager": True, | ||
# Required for spec decode. | ||
"use_v2_block_manager": True, | ||
"tensor_parallel_size": 2, | ||
# Use AsyncLLM engine, so that the engine runs in its own process. | ||
# Otherwise, since vLLM does not follow true SPMD, the test runner | ||
# process will have both the engine and the rank0 worker. NCCL is not | ||
# cleaned up properly, and its server host thread leaks, causing the | ||
# second run of the test to fail with internal NCCL error. | ||
"use_async": True, | ||
}]) | ||
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) | ||
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) | ||
@pytest.mark.parametrize("test_llm_kwargs", [ | ||
{ | ||
"speculative_model": "JackFram/llama-68m", | ||
"num_speculative_tokens": 5, | ||
"speculative_draft_tensor_parallel_size": 1, | ||
}, | ||
]) | ||
@pytest.mark.parametrize("batch_size", [2]) | ||
@pytest.mark.parametrize("seed", [1]) | ||
def test_draft_model_tp_lt_target_model_tp2(test_llm_generator, | ||
baseline_llm_generator, | ||
batch_size: int): | ||
"""Verify spec decode works well with smaller tp for draft models. | ||
""" | ||
run_greedy_equality_correctness_test(baseline_llm_generator, | ||
test_llm_generator, | ||
batch_size, | ||
max_output_len=32, | ||
force_output_len=True) |
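For intuition, here is a simplified stand-in for what run_greedy_equality_correctness_test (defined in the tests' conftest) asserts: with greedy sampling, speculative decoding is lossless because every accepted token is verified against the target model, so the baseline and spec-decode engines must emit identical token ids. The helper below is a hypothetical sketch, not the actual harness:

from vllm import SamplingParams


def check_greedy_equality(baseline_llm, spec_llm, prompts, max_output_len=32):
    """Sketch of a greedy-equality check (hypothetical helper, not vLLM's)."""
    params = SamplingParams(temperature=0.0, max_tokens=max_output_len)
    baseline_out = baseline_llm.generate(prompts, params)
    spec_out = spec_llm.generate(prompts, params)
    for base, spec in zip(baseline_out, spec_out):
        # Accepted tokens always come from the target model's distribution,
        # so under greedy decoding the sequences must match exactly.
        assert base.outputs[0].token_ids == spec.outputs[0].token_ids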