Fix insufficient initialization of the output HBM buffers, which caused the <unk> token to be generated for the quantized int8 model.

PiperOrigin-RevId: 631235764
jax-ml · May 7, 2024 · 4de3464 · jon-chuang · May 12, 2024 · 4de3464
1 parent eee2783
commit 4de3464
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/jax/experimental/pallas/ops/tpu/paged_attention/paged_attention_kernel.py b/jax/experimental/pallas/ops/tpu/paged_attention/paged_attention_kernel.py
@@ -256,6 +256,12 @@ def paged_flash_attention_kernel_inline_seq_dim(
 ):
   core_index, b, h = pl.program_id(0), pl.program_id(1), pl.program_id(2)
 
+  # Initialize the output HBM buffers to avoid accessing garbage memory inside
+  # the kernel body below.
+  m_ref[...] = jnp.full_like(m_ref, -jnp.inf)
+  l_ref[...] = jnp.zeros_like(l_ref)
+  o_ref[...] = jnp.zeros_like(o_ref)
+
   def body(i, _):
     paged_flash_attention_kernel(
         lengths_ref,