Skip to content

Commit

Permalink
rebase
Browse the repository at this point in the history
  • Loading branch information
Varun Sundar Rabindranath committed Sep 24, 2024
1 parent b67f107 commit 696fb6d
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,8 +743,8 @@ def _cuda_graph_pad_size(self,
num_seqs: int,
max_decode_seq_len: int,
max_encoder_seq_len: int = 0) -> int:
is_mscp: bool = \
self.runner.scheduler_config.is_multi_step_chunked_prefill
is_mscp: bool = self.runner.scheduler_config.is_multi_step and \
self.runner.scheduler_config.chunked_prefill_enabled
# In multi-step chunked-prefill, starting from the second step
# all the sequences are guaranteed to be decodes. So, we may
# run the first-step in eager mode and the rest of the steps
Expand Down Expand Up @@ -859,9 +859,10 @@ def build(self) -> ModelInputForGPU:

# Attention metadata.
# TODO (varun) : Handle flashinfer unsupported
is_mscp: bool = self.scheduler_config.is_multi_step and \
self.scheduler_config.chunked_prefill_enabled
use_graph_block_tables = cuda_graph_pad_size != -1 or \
(self.scheduler_config.is_multi_step_chunked_prefill and \
len(seq_lens) in _BATCH_SIZES_TO_CAPTURE)
(is_mscp and len(seq_lens) in _BATCH_SIZES_TO_CAPTURE)
attn_metadata = self.attn_metadata_builder.build(
seq_lens,
query_lens,
Expand Down

0 comments on commit 696fb6d

Please sign in to comment.