vllm-project · cadedaniel · Jul 1, 2024 · May 29, 2024 · Jun 3, 2024 · Jun 3, 2024
@@ -11,9 +11,15 @@
 numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
 equality. This gives us good coverage of temp=0.
 
+At temp=0, the TypicalAcceptanceSampler ensures that only the tokens with the
+highest probability in the target distribution are accepted. Therefore, we can 
+expect greedy equality for the TypicalAcceptanceSampler at temp=0.
+
 For temp>0, we rely on unit tests on the rejection sampler to verify that the
 output distribution is the same with spec decode vs. no spec decode (this would
-be prohibitively expensive to run with a real model).
+be prohibitively expensive to run with a real model). Similary, for the
+TypicalAcceptance sampler also, we rely on unit tests to validate temp>0
+test cases.
 
 NOTE: Speculative decoding's distribution equality requires that the measured
 distributions of the target model and proposal model be deterministic given the
@@ -177,7 +183,9 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator,
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize(
     "output_len",
@@ -230,8 +238,10 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
 @pytest.mark.parametrize("test_llm_kwargs", [
     {
         "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-    },
+        "num_speculative_tokens": 3,
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize(
     "output_len",
@@ -279,7 +289,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize("max_output_len", [
     256,
@@ -320,7 +332,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize("batch_size", [1])
 @pytest.mark.parametrize(
@@ -364,7 +378,9 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize("batch_size", [32])
 @pytest.mark.parametrize(
@@ -411,7 +427,9 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize(
     "output_len",
@@ -465,7 +483,9 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
     {
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize(
@@ -506,11 +526,13 @@ def test_spec_decode_different_block_size(baseline_llm_generator,
         {
             "speculative_model": "JackFram/llama-68m",
             "num_speculative_tokens": 5,
-
+            
             # Artificially limit the draft model max model len; this forces vLLM
             # to skip speculation once the sequences grow beyond 32-k tokens.
             "speculative_max_model_len": 32,
-        },
+            "speculative_draft_token_sampling_method": method
+        }
+        for method in ["rejection_sampler", "typical_acceptance_sampler"]
     ])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize(
@@ -554,7 +576,9 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
         "speculative_model": "JackFram/llama-68m",
         "num_speculative_tokens": 5,
         "speculative_disable_by_batch_size": 2,
-    },
+        "speculative_draft_token_sampling_method": method
+    }
+    for method in ["rejection_sampler", "typical_acceptance_sampler"]
 ])
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("output_len", [10])
@@ -589,9 +613,12 @@ def test_disable_speculation(baseline_llm_generator, test_llm_generator,
         {
             "speculative_model": "JackFram/llama-68m",
             "num_speculative_tokens": k,
+            "speculative_draft_token_sampling_method": method,
         }
         # Try a range of common k, as well as large speculation.
         for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
+        # Try both methods of sampling in the verifier.
+        for method in ["rejection_sampler", "typical_acceptance_sampler"]
     ])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize(

@@ -3,33 +3,33 @@
 import pytest
 import torch
 
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
 from vllm.spec_decode.top1_proposer import Top1Proposer
 
 from .utils import create_batch, mock_worker
-
+from .test_utils import mock_spec_decode_sampler
 
 @pytest.mark.parametrize('queue_size', [4])
 @pytest.mark.parametrize('batch_size', [1])
 @pytest.mark.parametrize('k', [1])
+@pytest.mark.parametrize("mock_spec_decode_sampler",
+  ["rejection_sampler", "typical_acceptance_sampler"], indirect=True)
 @torch.inference_mode()
-def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int):
+def test_disable_spec_tokens(
+    queue_size: int, batch_size: int, k: int, mock_spec_decode_sampler):
     """Verify that speculative tokens are disabled when the batch size
     exceeds the threshold.
     """
     disable_by_batch_size = 3
-
     draft_worker = mock_worker(cls=MultiStepWorker)
     target_worker = mock_worker()
-    rejection_sampler = MagicMock(spec=RejectionSampler)
     metrics_collector = MagicMock(spec=AsyncMetricsCollector)
     worker = SpecDecodeWorker(proposer_worker=draft_worker,
                               scorer_worker=target_worker,
-                              rejection_sampler=rejection_sampler,
+                              spec_decode_sampler=mock_spec_decode_sampler,
                               metrics_collector=metrics_collector,
                               disable_by_batch_size=disable_by_batch_size)
 

@@ -10,16 +10,16 @@
 def test_initial_call_returns_none():
     """Expect first call to get metrics to return None.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
-
-    collector = AsyncMetricsCollector(rej_sampler)
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
+
+    collector = AsyncMetricsCollector(spec_decode_sampler)
     collector.init_gpu_tensors(rank=0)
     maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5)
     assert maybe_metrics is None
@@ -28,22 +28,22 @@ def test_initial_call_returns_none():
 def test_second_call_returns_metrics():
     """Expect second call to not return None.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
 
     collect_interval_s = 5.0
     timer = MagicMock()
     timer.side_effect = [
         0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)
@@ -56,16 +56,16 @@ def test_second_call_returns_metrics():
 def test_nonzero_rank_noop(rank):
     """Verify nonzero ranks don't collect metrics.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
-
-    collector = AsyncMetricsCollector(rej_sampler)
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
+
+    collector = AsyncMetricsCollector(spec_decode_sampler)
     collector.init_gpu_tensors(rank=rank)
     _ = collector.maybe_collect_rejsample_metrics(k=5)
     metrics = collector.maybe_collect_rejsample_metrics(k=5)
@@ -75,14 +75,14 @@ def test_nonzero_rank_noop(rank):
 def test_noop_until_time():
     """Verify metrics aren't collected until enough time passes.
     """
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(0,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(0,
-                                                  dtype=torch.long,
-                                                  device='cuda')
-    rej_sampler.num_draft_tokens = 0
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(0,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(0,
+                                                          dtype=torch.long,
+                                                          device='cuda')
+    spec_decode_sampler.num_draft_tokens = 0
 
     collect_interval_s = 5.0
     timer = MagicMock()
@@ -91,7 +91,7 @@ def test_noop_until_time():
         collect_interval_s + 0.1, collect_interval_s + 0.1
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)
@@ -122,22 +122,22 @@ def test_initial_metrics_has_correct_values(has_data: bool):
     max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens(
         num_draft_tokens, k)
 
-    rej_sampler = MagicMock()
-    rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
-                                                   dtype=torch.long,
-                                                   device='cuda')
-    rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
+    spec_decode_sampler = MagicMock()
+    spec_decode_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens,
+                                                           dtype=torch.long,
+                                                           device='cuda')
+    spec_decode_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens,
                                                   dtype=torch.long,
                                                   device='cuda')
-    rej_sampler.num_draft_tokens = num_draft_tokens
+    spec_decode_sampler.num_draft_tokens = num_draft_tokens
 
     collect_interval_s = 5.0
     timer = MagicMock()
     timer.side_effect = [
         0.0, collect_interval_s + 0.1, collect_interval_s + 0.2
     ]
 
-    collector = AsyncMetricsCollector(rejection_sampler=rej_sampler,
+    collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler,
                                       timer=timer,
                                       collect_interval_s=collect_interval_s)
     collector.init_gpu_tensors(rank=0)