diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 9c55a1d5382..ac86aa269cc 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -239,6 +239,8 @@ def optimize_npu_model(cls, *args, **kwargs):
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+        compile_full_model = kwargs.pop("compile_full_model", False)
+        save_directory = kwargs.pop("save_directory", None)
 
         if hasattr(model, "llm"):
             llm = model.llm
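
The two kwargs popped above are new in this change and surface through `from_pretrained` like the existing `inter_pp`/`intra_pp`/`transpose_value_cache` options. A minimal usage sketch; the checkpoint name and the other argument values are illustrative, only `compile_full_model` and `save_directory` come from this diff:

```python
# Hypothetical caller: how compile_full_model / save_directory reach
# optimize_npu_model via **kwargs (checkpoint and values are illustrative).
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",           # illustrative model
    load_in_low_bit="sym_int4",
    optimize_model=True,
    max_prompt_len=512,
    transpose_value_cache=True,
    compile_full_model=True,              # new kwarg from this diff
    save_directory="./qwen2-npu-export",  # new kwarg from this diff
)
```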
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index 4834e1de45b..8806e64d1c9 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -56,7 +56,7 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
             f.write(model_stream)
 
     os.remove(xml_path)
-    os.remove(bin_path)
+    # keep the .bin file: the IR weights are still needed when the full model is exported
 
     if not keep_ir:
         os.remove(new_ir_path)
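
For context on why the `.bin` now survives: OpenVINO serializes an IR as an `.xml`/`.bin` pair and `read_model` needs both, so deleting the `.bin` leaves an IR that can no longer be loaded or recompiled. A minimal sketch of the compile-and-export step this helper performs (not the helper itself; file names are illustrative):

```python
# Minimal sketch of the IR -> NPU blob step (file names illustrative).
from openvino import Core

core = Core()
model = core.read_model("decoder_layer_0_new.xml")  # needs the sibling .bin
compiled = core.compile_model(model, "NPU")
with open("decoder_layer_0.blob", "wb") as f:
    f.write(compiled.export_model())                # serialized blob
```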
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 31be6e2a1e5..bc1acf3f21c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -350,9 +350,9 @@ def convert_llm(model: torch.nn.Module,
                 result = pool.starmap(convert_qwen_layer, param_list)
             
             if compile_full_model:
-                from .qwen import convert_qwen_prefill_layer, convert_lm_head_and_embedding
-                convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                                           temp_dir, weight_dir, transpose_value_cache, max_prompt_len, group_size)
+                convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
+                                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len,
+                                   group_size, layernorm_const, mode="prefill")
                 convert_lm_head_and_embedding(model, n_splits_linear,
                                               temp_dir, weight_dir, max_prompt_len)
 
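The dedicated prefill converter is gone; `convert_qwen_layer` now builds both graphs, selected by `mode`. The two traces share layer-0 weights but differ in the static input shape, which is why the layer is converted twice. A small sketch of the difference (dimensions are illustrative Qwen2-style values):

```python
# Why layer 0 is traced twice: NPU blobs are shape-static, so decode and
# prefill need separate graphs (illustrative Qwen2-1.5B-like dimensions).
num_heads, head_dim, max_prompt_len = 12, 128, 512
hidden = num_heads * head_dim

decode_shape = [1, 1, hidden]                # one new token per step
prefill_shape = [1, max_prompt_len, hidden]  # whole prompt in one pass
print(decode_shape, prefill_shape)           # [1, 1, 1536] [1, 512, 1536]
```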
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index 92e574fb23a..e82a0ac18e6 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -85,18 +85,19 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
     )
     suffix = "_prefill" if input_length > 1 else ""
     compile = False if input_length > 1 else True
-    if input_length == 0:
+    if input_length == 1:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding{suffix}",
                                                              temp_dir, compile, keep_ir=False)
-    if input_length > 1:
+    else:
         bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
         embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+        first_blob_path = None
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const):
+                       layernorm_const, mode="decode"):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -133,162 +134,84 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:  # FP16 Linear
         np_dtype = np.float16
 
-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, 1, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-        post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="decode",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        f"decoder_layer_{layer_idx}",
-                                                        temp_dir)
-    bin_path = os.path.join(temp_dir, f"decoder_layer_{layer_idx}" + ".bin")
-    os.remove(bin_path)
-
-    # 0, 1, 2 are input_embed/attention_mask/position_id
-    if layernorm_const:
-        st_idx = 3
-    else:
-        input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-        post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-        layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-        layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-        st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
-    # 6, 7 are past k/v
-    for idx, (weight, scale) in enumerate(weights):
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
-        weight.numpy().tofile(bin_file)
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
-        scale.numpy().tofile(bin_file)
-
-    del single_decoder
-
-
-def convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                               temp_dir, weight_dir, transpose_value_cache, kv_len, group_size):
-    layer_idx = 0
-    num_heads = model.model.layers[0].self_attn.num_heads
-    num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
-    head_dim = model.model.layers[0].self_attn.head_dim
-    intermediate_size = model.config.intermediate_size
-    rms_norm_eps = model.config.rms_norm_eps
-
-    from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer
-    curr_layer = model.model.layers[layer_idx]
-    attn_layer = curr_layer.self_attn
-    mlp_layer = curr_layer.mlp
-
-    weights = []
-    if n_splits_linear == 1:
-        for q, k, v, o, g, u in zip(attn_layer.q_proj_dq_list,
-                                    attn_layer.k_proj_dq_list,
-                                    attn_layer.v_proj_dq_list,
-                                    attn_layer.o_proj_dq_list,
-                                    mlp_layer.gate_proj_dq_list,
-                                    mlp_layer.up_proj_dq_list):
-            weights.append((q.weight, q.scale))
-            weights.append((k.weight, k.scale))
-            weights.append((v.weight, v.scale))
-            weights.append((o.weight, o.scale))
-            weights.append((g.weight, g.scale))
-            weights.append((u.weight, u.scale))
+    if mode == "decode":
+        single_decoder = LowBitQwenMultiDecoderlayer(
+            [1, 1, num_heads * head_dim],
+            input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
+            post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
+            q_biases=None,
+            k_biases=None,
+            v_biases=None,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=1,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode="decode",
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
     else:
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list]:
-            l_weights = []
-            scales = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    if n_splits_down_proj == 1:
-        for l in mlp_layer.down_proj_dq_list:
-            weights.append((l.weight, l.scale))
+        single_decoder = LowBitQwenMultiDecoderlayer(
+            [1, kv_len, num_heads * head_dim],
+            input_layernorm_weights=None,
+            post_attn_layernorm_weights=None,
+            q_biases=None,
+            k_biases=None,
+            v_biases=None,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=1,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode="prefill",
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
+    if mode == "decode":
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            f"decoder_layer_{layer_idx}",
+                                                            temp_dir)
     else:
-        l_weights = []
-        scales = []
-        for l in mlp_layer.down_proj_dq_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-        weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
-    k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
-    v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
-    layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
-    layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
-
-    if isinstance(weights[0], tuple):
-        np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8
-    else:  # FP16 Linear
-        np_dtype = np.float16
-
-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, kv_len, num_heads * head_dim],
-        input_layernorm_weights=None,
-        post_attn_layernorm_weights=None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="prefill",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        "decoder_layer_prefill",
-                                                        temp_dir,
-                                                        False)
-    bin_path = os.path.join(temp_dir, "decoder_layer_prefill" + ".bin")
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            "decoder_layer_prefill", temp_dir)
+        bin_path = os.path.join(temp_dir, "decoder_layer_prefill" + ".bin")
     os.remove(bin_path)
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
-    input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-    post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-    layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-    layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-    st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
+    if mode == "decode":
+        if layernorm_const:
+            st_idx = 3
+        else:
+            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+            st_idx = 5
+        q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
+        k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
+        v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
+        q_bias.data.numpy().tofile(q_bias_bin_file)
+        k_bias.data.numpy().tofile(k_bias_bin_file)
+        v_bias.data.numpy().tofile(v_bias_bin_file)
+        # the two inputs after the biases are past k/v; weight/scale pairs follow
+        for idx, (weight, scale) in enumerate(weights):
+            bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
+            weight.numpy().tofile(bin_file)
+            bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
+            scale.numpy().tofile(bin_file)
 
     del single_decoder
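
In decode mode the converted layer's runtime inputs are dumped as raw `.bin` files whose index encodes their role. A sketch of the layout the loop above produces, for the `layernorm_const=False` case (the `bin_name` helper is hypothetical; the indices follow the code above):

```python
# Index layout of the per-layer input bins written in decode mode:
#   0-2  input_embed / attention_mask / position_id (runtime, no bins)
#   3-4  input_layernorm / post_attention_layernorm weights
#   5-7  q / k / v biases
#   8-9  past k / past v (runtime, no bins)
#   10+  alternating (weight, scale) for each split linear
def bin_name(layer_idx: int, input_idx: int) -> str:
    return f"model_{layer_idx}_input_{input_idx}.bin"

st_idx = 5                                               # 3 when layernorm_const=True
print([bin_name(0, st_idx + i) for i in range(3)])       # q/k/v biases
print([bin_name(0, st_idx + 5 + j) for j in range(4)])   # first two (weight, scale) pairs
```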