From ee0f6c33c9be851b6d254620f70e30e027f527ea Mon Sep 17 00:00:00 2001
From: songhappy
Date: Thu, 20 Jun 2024 17:33:47 -0700
Subject: [PATCH] comments

---
 .../llm/src/ipex_llm/transformers/models/llama.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index f73b3b2b733..a53571e538c 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -2512,10 +2512,15 @@ def llama_model_forward_4_41_internal(
     if output_hidden_states:
         all_hidden_states += (hidden_states,)

-    next_cache = next_decoder_cache if use_cache else None
-    if return_legacy_cache:
-        next_cache = next_cache.to_legacy_cache()
-
+    next_cache = None
+    from ipex_llm.transformers.kv import DynamicFp8Cache
+    if use_cache:
+        next_cache = (
+            next_decoder_cache.to_legacy_cache()
+            if not isinstance(next_decoder_cache, DynamicFp8Cache)
+            else next_decoder_cache
+        )
+
     if not return_dict:
         return tuple(v for v in [hidden_states, next_cache, all_hidden_states,
                                  all_self_attns] if v is not None)
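
Note: the hunk replaces the unconditional legacy-cache conversion with a type
check, so IPEX-LLM's FP8 KV cache is never flattened into the legacy
tuple-of-tuples format. Below is a minimal sketch of that branch, assuming the
transformers DynamicCache API (>= 4.36); the select_cache() helper and the
stand-in DynamicFp8Cache subclass are illustrative only, not part of ipex-llm
or this patch.

    from transformers.cache_utils import DynamicCache


    class DynamicFp8Cache(DynamicCache):
        """Stand-in for ipex_llm.transformers.kv.DynamicFp8Cache."""


    def select_cache(next_decoder_cache, use_cache):
        # Mirror the patched branch: a plain DynamicCache is flattened to the
        # legacy tuple format via to_legacy_cache(), while an FP8 cache is
        # returned as-is, presumably because downstream IPEX-LLM code reads
        # its quantized contents through the Cache interface.
        if not use_cache:
            return None
        if isinstance(next_decoder_cache, DynamicFp8Cache):
            return next_decoder_cache
        return next_decoder_cache.to_legacy_cache()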