Skip to content

Commit

Permalink
rebase
Browse the repository at this point in the history
  • Loading branch information
Varun Sundar Rabindranath committed Sep 24, 2024
1 parent b67f107 commit 696fb6d
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def _cuda_graph_pad_size(self,
num_seqs: int,
max_decode_seq_len: int,
max_encoder_seq_len: int = 0) -> int:
is_mscp: bool = \
self.runner.scheduler_config.is_multi_step_chunked_prefill
is_mscp: bool = self.runner.scheduler_config.is_multi_step and \
self.runner.scheduler_config.chunked_prefill_enabled
# In multi-step chunked-prefill, starting from the second step
# all the sequences are guaranteed to be decodes. So, we may
# run the first-step in eager mode and the rest of the steps
Expand Down Expand Up @@ -859,9 +859,10 @@ def build(self) -> ModelInputForGPU:

# Attention metadata.
# TODO (varun) : Handle flashinfer unsupported
is_mscp: bool = self.scheduler_config.is_multi_step and \
self.scheduler_config.chunked_prefill_enabled
use_graph_block_tables = cuda_graph_pad_size != -1 or \
(self.scheduler_config.is_multi_step_chunked_prefill and \
len(seq_lens) in _BATCH_SIZES_TO_CAPTURE)
(is_mscp and len(seq_lens) in _BATCH_SIZES_TO_CAPTURE)
attn_metadata = self.attn_metadata_builder.build(
seq_lens,
query_lens,
Expand Down

0 comments on commit 696fb6d

Please sign in to comment.