[Bugfix] Add Prefix Caching Warmup Step #3901
base: main
Changes from 7 commits
@@ -788,6 +788,53 @@ def list_loras(self) -> Set[int]:
            raise RuntimeError("LoRA is not enabled.")
        return self.lora_manager.list_loras()

    @torch.inference_mode()
    def warmup_prefix_attn(self, kv_caches: List[torch.Tensor]) -> None:
        """Prefix attention uses a Triton JIT kernel.

        In our profile_run() step we profile with random data, so the
        cache-hit case is never executed. The Triton kernel is compiled on
        the fly, so the first call to context_attention_fwd takes ~3s.
        Without this warmup, that JIT compilation would happen on the hot
        path.

        Here we build two requests over the same prompt. Request 0 runs a
        prompt forward with self.block_size * NUM_BLOCKS + 1 tokens,
        filling the physical blocks in its block table. Request 1 then
        runs the same prompt, but with metadata marking the leading blocks
        as already computed. This triggers context_attention_fwd and
        compiles the kernel.
        """
        NUM_ITERATIONS = 10
        NUM_BLOCKS = 10
        NUM_COMPUTED_BLOCKS = NUM_BLOCKS - 1
        prompt_tokens = list(range(self.block_size * NUM_BLOCKS + 1))
        block_table = list(range(1, NUM_BLOCKS + 2))

        # Prompt forward to fill the KV cache for the prompt's blocks.
        request_0 = SequenceGroupMetadata(
            request_id="first_request",
            is_prompt=True,
            seq_data={0: SequenceData(prompt_tokens)},
            sampling_params=SamplingParams(temperature=0),
            block_tables={0: block_table},
        )
        self.execute_model([request_0], kv_caches)

        # Same prompt, but with the leading blocks marked as computed.
        # (Triggers context_attention_fwd.)
        request_1 = SequenceGroupMetadata(
            request_id="second_request",
            is_prompt=True,
            seq_data={0: SequenceData(prompt_tokens)},
            sampling_params=SamplingParams(temperature=0),
            block_tables={0: block_table},
            computed_block_nums=block_table[:NUM_COMPUTED_BLOCKS],
        )

Review comment (on the request_1 construction above): Can we just run […]?
Reply: Yeah, I think that should work.

        for _ in range(NUM_ITERATIONS):
            self.execute_model([request_1], kv_caches)

        return

    @torch.inference_mode()
    def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
        """Cuda graph capture a model.
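For readers unfamiliar with the compilation behaviour the docstring describes: a `@triton.jit` kernel is compiled the first time it is launched (per specialization), and that first launch can be orders of magnitude slower than later, cached launches. The standalone sketch below is illustrative only; it uses a toy add kernel rather than vLLM's `context_attention_fwd`, and simply demonstrates the first-call penalty that `warmup_prefix_attn` moves out of the request hot path.

```python
# Toy demonstration of Triton JIT warm-up cost (requires a CUDA GPU and triton).
# This is NOT vLLM code; the kernel below stands in for context_attention_fwd.
import time

import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)


def timed_launch(x, y, out):
    """Launch the kernel once and return wall-clock seconds."""
    n = x.numel()
    grid = (triton.cdiv(n, 1024),)
    torch.cuda.synchronize()
    start = time.perf_counter()
    add_kernel[grid](x, y, out, n, BLOCK_SIZE=1024)
    torch.cuda.synchronize()
    return time.perf_counter() - start


x = torch.rand(1 << 20, device="cuda")
y = torch.rand_like(x)
out = torch.empty_like(x)
print(f"first launch (includes JIT compile): {timed_launch(x, y, out):.4f}s")
print(f"second launch (kernel cached):       {timed_launch(x, y, out):.6f}s")
```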
@@ -165,6 +165,8 @@ def init_cache_engine(self, cache_config: CacheConfig) -> None:
    def warm_up_model(self) -> None:
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model(self.gpu_cache)
        if self.cache_config.enable_prefix_caching:
            self.model_runner.warmup_prefix_attn(self.gpu_cache)

Review comment (on lines +168 to +169): Is this called only in profiling? Or each inference?

        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)
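Regarding the question in the thread above: `warm_up_model` is part of the worker's one-time setup after the KV cache is allocated, not the per-request path, so the extra warmup cost is paid once at engine startup. The sketch below is a hedged usage example showing how prefix caching is enabled from the public API so that this code path runs; the model name is arbitrary and the constructor argument is assumed to be forwarded to the engine's cache config.

```python
# Hedged usage sketch (not part of the diff): enabling prefix caching so that
# warmup_prefix_attn runs during engine initialization.
from vllm import LLM, SamplingParams

# enable_prefix_caching=True is assumed to set cache_config.enable_prefix_caching,
# the flag checked in warm_up_model above. The warmup happens here, once, at init.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

shared_prefix = "You are a helpful assistant. Document: ..."
prompts = [shared_prefix + " Question 1?", shared_prefix + " Question 2?"]

# Prompts sharing a prefix can reuse cached KV blocks without paying the
# Triton JIT cost on the first cache hit.
outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)
```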
Review comment: I feel one iteration should be good enough?
Reply: I thought so too, but empirically it seemed I needed more than one run to make the timing stable, so I just picked 10. This takes <1s, so it's not impactful to UX, but I agree the code is a bit silly. Let me do some more experiments.
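One way to settle the one-vs-ten-iterations question empirically is to time each warmup call and check when the latency flattens out. A minimal sketch, assuming access to the same `model_runner`, `request_1`, and `kv_caches` objects that `warmup_prefix_attn` builds (the helper below is hypothetical, not part of the PR):

```python
import time

import torch


def time_warmup_iterations(model_runner, request_1, kv_caches, n=10):
    """Time successive warmup calls; the first should carry the Triton JIT cost."""
    timings = []
    for i in range(n):
        torch.cuda.synchronize()
        start = time.perf_counter()
        model_runner.execute_model([request_1], kv_caches)
        torch.cuda.synchronize()
        timings.append(time.perf_counter() - start)
        print(f"warmup iteration {i}: {timings[-1]:.3f}s")
    return timings
```

If only the first call is slow and the rest are flat, a single iteration (perhaps two, to also cover any lazy autotuning) should suffice.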