From ff340b5c4388016070f980dfd569627b4a471030 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 15:35:47 +0800 Subject: [PATCH 01/21] initiail support of q4_1 --- python/llm/src/ipex_llm/ggml/quantize.py | 1 + .../ipex_llm/transformers/low_bit_linear.py | 16 +++++++++++----- .../src/ipex_llm/transformers/npu_model.py | 1 + .../transformers/npu_models/convert.py | 19 ++++++++++++++++--- .../transformers/npu_models/linear.py | 15 +++++++++++++++ 5 files changed, 44 insertions(+), 8 deletions(-) diff --git a/python/llm/src/ipex_llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py index 76702e88117..a95e3464e32 100644 --- a/python/llm/src/ipex_llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -52,6 +52,7 @@ "fp6_k": 30, "sym_int4_rtn": 31, "sym_int8_rtn": 32, + "asym_int4_rtn": 33, } # mixed precison from llama.cpp diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 82fbdf6f506..ed44140d708 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -84,8 +84,10 @@ FP6_K = ggml_tensor_qtype["fp6_k"] SYM_INT4_RTN = ggml_tensor_qtype["sym_int4_rtn"] SYM_INT8_RTN = ggml_tensor_qtype["sym_int8_rtn"] +ASYM_INT4_RTN = ggml_tensor_qtype["asym_int4_rtn"] RTN_DTYPE = { SYM_INT4_RTN: torch.uint8, + ASYM_INT4_RTN: torch.uint8, SYM_INT8_RTN: torch.int8, } @@ -223,12 +225,16 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, f"Last dim of input tensor must be multiple of {QK}") dst_size = (n // QK) * block_size_in_bytes - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: dst_tensor = torch.empty(dst_size, dtype=RTN_DTYPE[qtype], device=device) dst_tensor = dst_tensor.reshape(tensor.shape[0], tensor.shape[-1] // QK) - scale = torch.empty(n // k, dtype=torch.float32, - device=device) + if qtype == ASYM_INT4_RTN: + scale = torch.empty((n // k) * 2, dtype=torch.float32, + device=device) + else: + scale = torch.empty(n // k, dtype=torch.float32, + device=device) elif qtype == NF4: # Deepspeed zero3 requires unified dtype, # thus here uses bfloat16 consistent to other layers @@ -244,7 +250,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) hist = (ctypes.c_int64 * 16)() if qtype not in [IQ2_XXS, IQ2_XS, Q2_K, IQ1_S, Q4_K, Q6_K, Q5_K, FP6_K]: - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: scale_ptr = ctypes.cast(scale.data.data_ptr(), ctypes.POINTER(ctypes.c_float)) if imatrix is None: ggml.ggml_quantize_tensor_rtn(src, dst, scale_ptr, qtype, n, @@ -269,7 +275,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, ggml.ggml_quantize_tensor_with_weights(src, dst, qtype, n // in_features, in_features, hist, imatrix) - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: return dst_tensor, scale.type(torch.float16) else: return dst_tensor diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 9dbbd1b8fde..f1e14a78f89 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -103,6 +103,7 @@ def from_pretrained(cls, *args, **kwargs): qtype_map = { "sym_int4": "sym_int4_rtn", "sym_int8": "sym_int8_rtn", + "asym_int4": "asym_int4_rtn", } invalidInputError( diff --git 
a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 9ac0c9a6dda..8d5339575a0 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -99,8 +99,12 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return QuantizedLinear(qweights, scale, layer.bias, - group_size=group_size) + min = None + # split scale to scale & min + if qtype == "asym_int4_rtn": + scale, min = torch.split(scale, scale.shape[0] // 2) + return QuantizedLinear(qweights, scale, min, layer.bias, + group_size=group_size, qtype=qtype) @module_optimization @@ -111,12 +115,21 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): + if qtype == "sym_int4_rtn": + # workaround for qwen2-7B & int4 + if (layer.in_features == 3584 and layer.out_features == 152064): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return DequantizedLinear(qweights, scale, layer.bias) + min = None + # split scale to scale & min + if qtype == "asym_int4_rtn": + scale, min = torch.split(scale, scale.shape[0] // 2) + return DequantizedLinear(qweights, scale, min, layer.bias, qtype) @module_optimization diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 2c4b5f37738..60ba4b7ba0f 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -129,7 +129,9 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + min: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", group_size: int = 0, ): """Initialize the QuantizedLinear class. @@ -137,8 +139,10 @@ def __init__( Args: weight (torch.Tensor): Linear operation weight scale (torch.Tensor): Quantization scale + min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. + qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format @@ -163,6 +167,8 @@ def __init__( self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) self.bias = bias + self.min = min + self.qtype = qtype self.op_id = str(uuid.uuid4()) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -197,6 +203,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = run_matmul(x, self.weight.data, self.scale.data, self.op_id) + if self.qtype == "asym_int4_rtn" and self.min is not None: + out = out + self.min + if self.bias is None: return out return out + self.bias @@ -209,14 +218,18 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + min: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", ): """Initialize the DequantizedLinear class. 
Args: weight (torch.Tensor): Linear operation quantized weight scale (torch.Tensor): Quantization scale + min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. + qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format """ @@ -240,6 +253,8 @@ def __init__( decompressed_weight = combined_weight.view(combined_weight.size(0), -1) dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) + if qtype == "asym_int4_rtn" and min is not None: + dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: dequantized_weight = weight.to(torch.float32) * \ From 1e423049d0c133c1d04aab276bec57f1441c0091 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 16:41:26 +0800 Subject: [PATCH 02/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/convert.py | 2 +- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 8d5339575a0..8fd0704b194 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -115,7 +115,7 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): - if qtype == "sym_int4_rtn": + if qtype in ["sym_int4_rtn", "asym_int4_rtn"]: # workaround for qwen2-7B & int4 if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 60ba4b7ba0f..6eb3835ace8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -245,7 +245,8 @@ def __init__( ) if weight.dtype == torch.uint8: - weight = weight.view(torch.int8) + if qtype == "sym_int_rtn": + weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 From 77321a736e0d8db6696aab58c043365b0d441cde Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 17:29:06 +0800 Subject: [PATCH 03/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 6eb3835ace8..0b636b098d6 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -245,7 +245,7 @@ def __init__( ) if weight.dtype == torch.uint8: - if qtype == "sym_int_rtn": + if qtype == "sym_int4_rtn": weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 @@ -255,7 +255,8 @@ def __init__( dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: - dequantized_weight = dequantized_weight + 
torch.unsqueeze(min.to(torch.float32), dim=1) + dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), + dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: dequantized_weight = weight.to(torch.float32) * \ From 21d9811e6c92053dec82dc0fe16a358b09a94d29 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 09:43:55 +0800 Subject: [PATCH 04/21] update --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- .../src/ipex_llm/transformers/npu_models/linear.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index f1e14a78f89..21afc14ca90 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -221,7 +221,7 @@ def from_pretrained(cls, *args, **kwargs): model = cls.optimize_npu_model(*args, **optimize_kwargs) else: from ipex_llm.transformers.npu_models.convert import optimize_llm - optimize_llm(model) + # optimize_llm(model) with torch.no_grad(): cls.load_convert(qtype, model, "cpu", modules_to_not_convert, quantization_group_size, imatrix_data, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 0b636b098d6..ff9eb5e31b3 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -201,7 +201,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) ) - out = run_matmul(x, self.weight.data, self.scale.data, self.op_id) + min_data = self.min.data if self.min is not None else None + out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) if self.qtype == "asym_int4_rtn" and self.min is not None: out = out + self.min @@ -245,15 +246,16 @@ def __init__( ) if weight.dtype == torch.uint8: - if qtype == "sym_int4_rtn": - weight = weight.view(torch.int8) + weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - dequantized_weight = decompressed_weight.to(torch.float32) * \ - torch.unsqueeze(scale.to(torch.float32), dim=1) + if qtype == "asym_int4_rtn": + decompressed_weight = decompressed_weight + 8 + dequantized_weight = decompressed_weight.to(torch.float32) + dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) From 23f902312a73e3ac3565a8c93e2a7de2caa5500a Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 11:44:27 +0800 Subject: [PATCH 05/21] update min to Z1 --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index ff9eb5e31b3..22d64b005ca 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -252,8 +252,6 @@ def __init__( combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - if qtype == "asym_int4_rtn": - 
decompressed_weight = decompressed_weight + 8 dequantized_weight = decompressed_weight.to(torch.float32) dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: From 293fb8386a89a3d766829f49dfaf651f48dab419 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 14:16:46 +0800 Subject: [PATCH 06/21] update --- .../llm/src/ipex_llm/transformers/npu_models/convert.py | 5 ++--- .../llm/src/ipex_llm/transformers/npu_models/linear.py | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 8fd0704b194..cc641fbc3b2 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -88,10 +88,9 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): - if qtype == "sym_int4_rtn": + if qtype in ["sym_int4_rtn", "asym_int4_rtn"]: # workaround for qwen2-7B & int4 - if (layer.in_features == 3584 and layer.out_features == 152064) or \ - (layer.in_features == 18944 and layer.out_features == 3584): + if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 22d64b005ca..eaa1cd7c718 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -166,8 +166,11 @@ def __init__( # Int4 we need to double the input channels because weights are compressed self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) + if min is not None: + self.min = Parameter(min * math.sqrt(self.inC), requires_grad=False) + else: + self.min = None self.bias = bias - self.min = min self.qtype = qtype self.op_id = str(uuid.uuid4()) @@ -202,11 +205,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) min_data = self.min.data if self.min is not None else None + print("min is None:", min is None) out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) - if self.qtype == "asym_int4_rtn" and self.min is not None: - out = out + self.min - if self.bias is None: return out return out + self.bias From 3e799d59eccc76870fe447f49e5342d9fbd9c14c Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:15:39 +0800 Subject: [PATCH 07/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index eaa1cd7c718..bbb006aa48d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -205,7 +205,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) min_data = self.min.data if self.min is not None else None - print("min is None:", min is None) out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) if self.bias is None: From 17fbe84944eafd8b52e652bb676af51c8f2427aa Mon Sep 17 
00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:44:25 +0800 Subject: [PATCH 08/21] update --- python/llm/src/ipex_llm/transformers/npu_model.py | 7 ++++++- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 21afc14ca90..4bdc5eb4aab 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -155,6 +155,11 @@ def from_pretrained(cls, *args, **kwargs): f"but got {quantization_group_size}" ) ) + + if low_bit == "asym_int4": + invalidInputError(quantization_group_size > 0, + "asym_int4 only support quantization_group_size == 0 for now.") + _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) @@ -221,7 +226,7 @@ def from_pretrained(cls, *args, **kwargs): model = cls.optimize_npu_model(*args, **optimize_kwargs) else: from ipex_llm.transformers.npu_models.convert import optimize_llm - # optimize_llm(model) + optimize_llm(model) with torch.no_grad(): cls.load_convert(qtype, model, "cpu", modules_to_not_convert, quantization_group_size, imatrix_data, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index bbb006aa48d..4decce2893b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -252,8 +252,8 @@ def __init__( combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - dequantized_weight = decompressed_weight.to(torch.float32) - dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) + dequantized_weight = decompressed_weight.to(torch.float32) * \ + torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) From c5e14ad352f8e1be34a6d3fc4396dc5689a1dc7f Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:45:51 +0800 Subject: [PATCH 09/21] fix style --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 4bdc5eb4aab..b08b0352a98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -155,7 +155,7 @@ def from_pretrained(cls, *args, **kwargs): f"but got {quantization_group_size}" ) ) - + if low_bit == "asym_int4": invalidInputError(quantization_group_size > 0, "asym_int4 only support quantization_group_size == 0 for now.") From 59dfc424b03a15437154835a5f0b9d2134a402ae Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 16:01:37 +0800 Subject: [PATCH 10/21] fix --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index b08b0352a98..2a56de7df6d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -157,7 +157,7 @@ def from_pretrained(cls, *args, **kwargs): ) if low_bit == "asym_int4": - invalidInputError(quantization_group_size > 0, + 
invalidInputError(quantization_group_size == 0, "asym_int4 only support quantization_group_size == 0 for now.") _args = copy.deepcopy(args) From 1cc8b96a6fbaa270fdc273be828fb30896134861 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 18:07:27 +0800 Subject: [PATCH 11/21] support qwen2 optimize_model=True mp version --- .../src/ipex_llm/transformers/npu_model.py | 51 ++++++++++--------- .../transformers/npu_models/convert_mp.py | 2 +- .../transformers/npu_models/mp_models_base.py | 42 ++++++++++----- .../transformers/npu_models/qwen2_mp.py | 44 ++++++++++++---- 4 files changed, 91 insertions(+), 48 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 2a56de7df6d..7f86f6e45d5 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -276,6 +276,7 @@ def optimize_npu_model(cls, *args, **kwargs): with torch.no_grad(): model.config.update({"mixed_precision": mixed_precision}) model.config.update({"group_size": quantization_group_size}) + model.config.update({"asym": qtype == "asym_int4_rtn"}) optimize_llm_pre(model, qtype, mixed_precision, quantization_group_size=quantization_group_size) cls.load_convert(qtype, model, "cpu", modules_to_not_convert, @@ -288,29 +289,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if (not hasattr(model, 'llm') and - model.config.model_type in ["qwen2", "llama", "minicpm"]): - from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - optimize_llm_single_process( - llm, - kv_len=max_context_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size, - qtype=qtype, - save_directory=save_directory, - fuse_layers=fuse_layers - ) - else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + # if (not hasattr(model, 'llm') and + # model.config.model_type in ["qwen2", "llama", "minicpm"]): + # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + # optimize_llm_single_process( + # llm, + # kv_len=max_context_len, + # max_prompt_len=max_prompt_len, + # transpose_value_cache=transpose_value_cache, + # group_size=quantization_group_size, + # qtype=qtype, + # save_directory=save_directory, + # fuse_layers=fuse_layers + # ) + # else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm @@ -422,9 +423,9 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) ) invalidInputError( - qtype in ["sym_int8_rtn", "sym_int4_rtn"], + qtype in ["sym_int8_rtn", "sym_int4_rtn", "asym_int4_rtn"], f"Unknown bigdl_transformers_low_bit value: {qtype}," - f" expected: sym_int8_rtn, sym_int4_rtn. " + f" expected: sym_int8_rtn, sym_int4_rtn, asym_int4_rtn. 
" ) if enable_cpp_backend: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 39c9cd00fe6..0d1089963c9 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -128,7 +128,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, from ipex_llm.transformers.npu_models.common import split_linears if quantization_group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index ccf6e242d90..59cc2b1920e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -59,9 +59,15 @@ def run_model( op_args_flatten = [] for w in weights: if isinstance(w, tuple): # from QuantizedLinear - op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) - op_args_flatten.append(op_args[-1][0]) - op_args_flatten.append(op_args[-1][1]) + if len(w) == 2: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + else: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), set_contiguous(w[2]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + op_args_flatten.append(op_args[-1][2]) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_args.append(w.numpy()) op_args_flatten.append(op_args[-1]) @@ -104,7 +110,7 @@ def run_model( class LLMBaseNNFactory(NNFactory): def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="NPU", - n_splits_linear=1, n_splits_down_proj=1, group_size=0): + n_splits_linear=1, n_splits_down_proj=1, group_size=0, asym=False): super().__init__(profile, device) self.cache_parameter_ops = [] self.input_ops = [] @@ -117,6 +123,7 @@ def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="N self.n_splits_linear = n_splits_linear self.n_splits_down_proj = n_splits_down_proj self.group_size = group_size + self.asym = asym def attention(self, *, @@ -149,7 +156,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) key_states = self.linear( @@ -160,7 +168,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) value_states = self.linear( @@ -171,7 +180,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) if q_bias is not None: @@ -260,7 +270,8 @@ def attention(self, attn_output, hidden_size, hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return attn_output, new_key_states, new_value_states @@ -428,13 +439,15 @@ 
def mlp(self, hidden_states, seq_len=-1, mode="prefill"): hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) mm2 = self.linear( hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] @@ -442,7 +455,8 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill"): mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_down_proj, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return hidden_states @@ -558,11 +572,13 @@ def linear(self, wt_dtype: npt.DTypeLike = np.float16, n_splits: int = 1, scale_factor: bool = True, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): if n_splits == 1: op = super().linear(input_node, output_channels, input_channels, bias, act_dtype, - wt_dtype, scale_factor=scale_factor) + wt_dtype, scale_factor=scale_factor, + asym=asym) else: op = super().dq_split_linear(input_node, n_splits, output_channels, input_channels, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 015efe10031..d32f1751a98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -97,7 +97,8 @@ def __init__( intermediate_size, n_splits_linear: int = 1, n_splits_down_proj: int = 1, - group_size: int = 0 + group_size: int = 0, + asym: bool = False, ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -106,7 +107,8 @@ def __init__( device=device, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size) + group_size=group_size, + asym=asym) self.max_seq_len = max_seq_len self.intermediate_size = intermediate_size self.dtype = dtype @@ -311,6 +313,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() @@ -318,8 +321,10 @@ def __init__( op_parameters = [] for w in parameters: - if isinstance(w, tuple): # from QuantizedLinear + if isinstance(w, tuple) and not asym: # from QuantizedLinear op_parameters.append((w[0].numpy(), w[1].numpy())) + elif isinstance(w, tuple) and asym: # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy(), w[2].numpy())) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_parameters.append(w.numpy()) elif isinstance(w, np.ndarray): # scale @@ -375,7 +380,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym, ) self.backend_decoders.append(decoder) @@ -461,6 +467,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() self.op_parameters = parameters @@ -491,7 +498,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + 
group_size=group_size, + asym=asym ) self.layer_norm_0 = layer_norm_0 self.layer_norm_1 = layer_norm_1 @@ -580,6 +588,7 @@ def run_decode( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -592,10 +601,17 @@ def run_decode( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -630,7 +646,8 @@ def run_decode( do_print=False, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) dist.barrier() @@ -809,6 +826,7 @@ def run_prefill( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -821,10 +839,17 @@ def run_prefill( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -850,7 +875,8 @@ def run_prefill( transpose_value=transpose_value_cache, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) layer_weights.extend(weights) From 5f02aacb6146f836f62a663bd73af4e070bbc566 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 18:57:12 +0800 Subject: [PATCH 12/21] temp save --- .../src/ipex_llm/transformers/npu_model.py | 46 ++++++------ .../npu_pipeline_model/convert_pipeline.py | 6 +- .../transformers/npu_pipeline_model/qwen.py | 75 ++++++++++++++----- 3 files changed, 85 insertions(+), 42 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 7f86f6e45d5..657daa0aa3e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -289,29 +289,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - # if (not hasattr(model, 'llm') and - # model.config.model_type in ["qwen2", "llama", "minicpm"]): - # from ipex_llm.transformers.npu_models.convert import 
optimize_llm_single_process - # optimize_llm_single_process( - # llm, - # kv_len=max_context_len, - # max_prompt_len=max_prompt_len, - # transpose_value_cache=transpose_value_cache, - # group_size=quantization_group_size, - # qtype=qtype, - # save_directory=save_directory, - # fuse_layers=fuse_layers - # ) - # else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + if (not hasattr(model, 'llm') and + model.config.model_type in ["qwen2", "llama", "minicpm"]): + from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + optimize_llm_single_process( + llm, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size, + qtype=qtype, + save_directory=save_directory, + fuse_layers=fuse_layers + ) + else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 2e6b249c1a5..5304d124539 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -201,7 +201,7 @@ def convert_llm(model: torch.nn.Module, layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" if group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: @@ -433,6 +433,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, if not os.path.exists(weight_dir): os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" + asym = getattr(model.config, "asym", False) if model.config.model_type == "qwen2": if group_size == 0: @@ -456,7 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "asym": asym} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index e4b318244ce..40d2ebf135b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -104,6 +104,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, head_dim = model.model.layers[0].self_attn.head_dim intermediate_size = model.config.intermediate_size rms_norm_eps = model.config.rms_norm_eps + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer curr_layer = model.model.layers[layer_idx] @@ -117,10 +118,17 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in 
layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16) k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16) @@ -164,7 +172,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) rest_blob_path = update_names_of_IR_and_export_blob(single_decoder, decoder_name, @@ -188,11 +197,23 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, k_bias.data.numpy().tofile(k_bias_bin_file) v_bias.data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, min) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + min.numpy().tofile(bin_file) del single_decoder @@ -207,6 +228,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down rms_norm_eps = model.config.rms_norm_eps layer_num = len(model.model.layers) fused_layer_num = layer_num // fused_layers + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer for i in range(fused_layers): @@ -228,15 +250,22 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down weights = [] for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list, - attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, - mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, - mlp_layer.down_proj_dq_list]: + attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, + mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, + mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = 
curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -264,12 +293,23 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down k_biases[-1].data.numpy().tofile(k_bias_bin_file) v_biases[-1].data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, - f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, min) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + min.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -297,6 +337,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, group_size=group_size + asym=asym ) update_names_of_IR_and_export_blob(fused_decoder, f"decoder_layer_{i}", From dc1ec36e425482946f13cb6a33bcb41869d5519d Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 19:46:59 +0800 Subject: [PATCH 13/21] fix --- .../npu_pipeline_model/convert_pipeline.py | 4 +--- .../ipex_llm/transformers/npu_pipeline_model/qwen.py | 10 +++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 5304d124539..ee773614b63 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -433,7 +433,6 @@ def convert_llm_for_deploy(model: torch.nn.Module, if not os.path.exists(weight_dir): os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" - asym = getattr(model.config, "asym", False) if model.config.model_type == "qwen2": if group_size == 0: @@ -457,8 +456,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj, - "asym": asym} + "n_splits_down_proj": n_splits_down_proj} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 40d2ebf135b..40443ace85d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -205,7 +205,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, 
(weight, scale, min) in enumerate(weights): + for idx, (weight, scale, m) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -213,7 +213,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - min.numpy().tofile(bin_file) + m.numpy().tofile(bin_file) del single_decoder @@ -301,7 +301,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, min) in enumerate(weights): + for idx, (weight, scale, m) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -309,7 +309,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - min.numpy().tofile(bin_file) + m.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -336,7 +336,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, asym=asym ) update_names_of_IR_and_export_blob(fused_decoder, From f54afdc41b72633576162775603387d2b43580a8 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 20:01:00 +0800 Subject: [PATCH 14/21] fix style --- .../transformers/npu_models/mp_models_base.py | 3 ++- .../ipex_llm/transformers/npu_pipeline_model/qwen.py | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 59cc2b1920e..3b9ef4d2de4 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -64,7 +64,8 @@ def run_model( op_args_flatten.append(op_args[-1][0]) op_args_flatten.append(op_args[-1][1]) else: - op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), set_contiguous(w[2]).numpy())) + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), + set_contiguous(w[2]).numpy())) op_args_flatten.append(op_args[-1][0]) op_args_flatten.append(op_args[-1][1]) op_args_flatten.append(op_args[-1][2]) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 40443ace85d..28f4cb8dc4d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -250,9 +250,9 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down weights = [] for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list, - attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, - mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, - mlp_layer.down_proj_dq_list]: + attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, + mlp_layer.gate_proj_dq_list, 
mlp_layer.up_proj_dq_list, + mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] mins = [] @@ -295,14 +295,16 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down # 6, 7 are past k/v if not asym: for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: for idx, (weight, scale, m) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") From 0450c7b2d846e0458a60b80338c1532bd1fb4829 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 10:29:56 +0800 Subject: [PATCH 15/21] replace min with zero --- .../transformers/npu_models/convert.py | 16 +++++------ .../transformers/npu_models/linear.py | 22 +++++++-------- .../transformers/npu_models/qwen2_mp.py | 20 ++++++------- .../transformers/npu_pipeline_model/qwen.py | 28 +++++++++---------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index cc641fbc3b2..4aa45b0054a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -98,11 +98,11 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - min = None - # split scale to scale & min + zero = None + # split scale to scale & zero if qtype == "asym_int4_rtn": - scale, min = torch.split(scale, scale.shape[0] // 2) - return QuantizedLinear(qweights, scale, min, layer.bias, + scale, zero = torch.split(scale, scale.shape[0] // 2) + return QuantizedLinear(qweights, scale, zero, layer.bias, group_size=group_size, qtype=qtype) @@ -124,11 +124,11 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - min = None - # split scale to scale & min + zero = None + # split scale to scale & zero if qtype == "asym_int4_rtn": - scale, min = torch.split(scale, scale.shape[0] // 2) - return DequantizedLinear(qweights, scale, min, layer.bias, qtype) + scale, zero = torch.split(scale, scale.shape[0] // 2) + return DequantizedLinear(qweights, scale, zero, layer.bias, qtype) @module_optimization diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 4decce2893b..eb9ae98c75e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -129,7 +129,7 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, - min: Optional[torch.Tensor] = None, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, qtype: Optional[str] = "sym_int4_rtn", group_size: int = 0, @@ -139,7 +139,7 @@ def __init__( Args: weight (torch.Tensor): Linear operation weight scale (torch.Tensor): Quantization scale - min 
(Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. qtype (Optional[str], optional): qtype of this Linear @@ -166,10 +166,10 @@ def __init__( # Int4 we need to double the input channels because weights are compressed self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) - if min is not None: - self.min = Parameter(min * math.sqrt(self.inC), requires_grad=False) + if zero is not None: + self.zero = Parameter(zero * math.sqrt(self.inC), requires_grad=False) else: - self.min = None + self.zero = None self.bias = bias self.qtype = qtype self.op_id = str(uuid.uuid4()) @@ -204,8 +204,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) ) - min_data = self.min.data if self.min is not None else None - out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) + zero_data = self.zero.data if self.zero is not None else None + out = run_matmul(x, self.weight.data, self.scale.data, zero_data, self.op_id) if self.bias is None: return out @@ -219,7 +219,7 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, - min: Optional[torch.Tensor] = None, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, qtype: Optional[str] = "sym_int4_rtn", ): @@ -227,7 +227,7 @@ def __init__( Args: weight (torch.Tensor): Linear operation quantized weight scale (torch.Tensor): Quantization scale - min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 
qtype (Optional[str], optional): qtype of this Linear @@ -254,8 +254,8 @@ def __init__( decompressed_weight = combined_weight.view(combined_weight.size(0), -1) dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) - if qtype == "asym_int4_rtn" and min is not None: - dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), + if qtype == "asym_int4_rtn" and zero is not None: + dequantized_weight = dequantized_weight + torch.unsqueeze(zero.to(torch.float32), dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index d32f1751a98..397739cb72a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -601,15 +601,15 @@ def run_decode( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -839,15 +839,15 @@ def run_prefill( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 28f4cb8dc4d..6eb6b2553b7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -118,15 +118,15 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -205,7 +205,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, m) in enumerate(weights): + for idx, (weight, scale, zero) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -213,7 +213,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, 
f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - m.numpy().tofile(bin_file) + zero.numpy().tofile(bin_file) del single_decoder @@ -255,15 +255,15 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -302,7 +302,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, m) in enumerate(weights): + for idx, (weight, scale, zero) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) @@ -311,7 +311,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - m.numpy().tofile(bin_file) + zero.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 From ecdd8274b0b32ffdb10a62aee8d0d601ff353b76 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 15:04:09 +0800 Subject: [PATCH 16/21] support split linear for q4_1 --- .../src/ipex_llm/transformers/npu_model.py | 5 --- .../transformers/npu_models/convert_mp.py | 15 +++++--- .../transformers/npu_models/linear.py | 4 +-- .../transformers/npu_models/lm_head.py | 34 +++++++++++++----- .../transformers/npu_models/mp_models_base.py | 8 +++-- .../transformers/npu_pipeline_model/common.py | 2 ++ .../npu_pipeline_model/convert_pipeline.py | 15 ++++++-- .../transformers/npu_pipeline_model/qwen.py | 35 +++++++++++++++---- 8 files changed, 85 insertions(+), 33 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 657daa0aa3e..9744e2f85f1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -156,11 +156,6 @@ def from_pretrained(cls, *args, **kwargs): ) ) - if low_bit == "asym_int4": - invalidInputError(quantization_group_size == 0, - "asym_int4 only support quantization_group_size == 0 for now.") - - _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) try: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 0d1089963c9..4edfd8aee01 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -154,18 +154,21 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # workaround for MiniCPM-2B new_lm_head_0 = SlicedLMHead(model.lm_head_0.weight, split_num=split_num, bias=model.lm_head_0.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head_0 model.lm_head_0 = new_lm_head_0 new_lm_head_1 = SlicedLMHead(model.lm_head_1.weight, split_num=split_num, 
bias=model.lm_head_1.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head_1 model.lm_head_1 = new_lm_head_1 else: new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head model.lm_head = new_lm_head @@ -176,11 +179,13 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # Do not split lm_head and use sym_int8 instead when mixed_precison is True if quantization_group_size == 0: # Do not split lm_head and use sym_int8 instead when mixed_precison is True - is_split = (not mixed_precision) and qtype == "sym_int4_rtn" + is_split = (not mixed_precision) and qtype in ["sym_int4_rtn", "asym_int4_rtn"] split_num = 14 if is_split else 1 + print("enter here, split num is ", split_num) new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head model.lm_head = new_lm_head diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index eb9ae98c75e..c8a5dd467ae 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -159,8 +159,10 @@ def __init__( ) ) self.outC, self.inC = self.weight.shape + self.zero = None if group_size != 0: self.scale = Parameter(scale, requires_grad=False) + self.zero = Parameter(zero, requires_grad=False) else: if self.weight.dtype == torch.uint8: # Int4 we need to double the input channels because weights are compressed @@ -168,8 +170,6 @@ def __init__( self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) if zero is not None: self.zero = Parameter(zero * math.sqrt(self.inC), requires_grad=False) - else: - self.zero = None self.bias = bias self.qtype = qtype self.op_id = str(uuid.uuid4()) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py index f306ae0e4e0..da1d8c9d37e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py @@ -36,6 +36,7 @@ def __init__( dtype: np.dtype = np.int8, use_split: bool = False, group_size: int = 0, + asym: bool = False, ): """Initialize the LMHeadLinear class. 
@@ -58,7 +59,7 @@ def __init__( if use_split: input = self.parameter((1, self.batch, self.inC)) res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype, - scale_factor=(group_size == 0)) + scale_factor=(group_size == 0), asym=asym) else: input = self.parameter((self.batch, self.inC)) split_size = self.inC // split_num // 2 * 2 @@ -69,7 +70,7 @@ def __init__( input_slice = self.slice(input, begin=[0, start_idx], end=[self.batch, end_idx]) linear_slice = self.linear(input_slice, outC, split_size, bias=False, - wt_dtype=dtype) + wt_dtype=dtype, asym=asym) if i == 0: res = linear_slice else: @@ -109,7 +110,7 @@ def run( class SlicedLMHead(nn.Module): - def __init__(self, weight, bias, split_num, use_split=False, group_size=0): + def __init__(self, weight, bias, split_num, use_split=False, group_size=0, asym=False): super().__init__() self.split_num = split_num self.outC, self.inC = weight.shape @@ -128,6 +129,7 @@ def __init__(self, weight, bias, split_num, use_split=False, group_size=0): self.lm_heads.append(new_linear) self.bias = bias self.use_split = use_split + self.asym = asym def forward(self, hidden_states): if hidden_states.size(0) * hidden_states.size(1) == 1: @@ -162,19 +164,33 @@ def get_fused_lm_head(self): np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8 self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num, False, "NPU", dtype=np_dtype, use_split=self.use_split, - group_size=self.group_size) + group_size=self.group_size, asym=self.asym) if self.use_split: weights = [] scales = [] + zeros = [] for i in range(self.split_num): weights.append(self.lm_heads[i].weight) scales.append(self.lm_heads[i].scale) - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy()) + if self.asym: + zeros.append(self.lm_heads[i].zero) + if self.asym: + fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy(), + torch.stack(zeros, axis=0).numpy(),) + else: + fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy()) else: - fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), - self.lm_heads[i].scale.data.numpy()) - for i in range(self.split_num)] + if self.asym: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy(), + self.lm_heads[i].zero.data.numpy()) + for i in range(self.split_num)] + else: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy()) + for i in range(self.split_num)] self.fused_lm_head.set_weights(self.lm_heads[0].op_id, fused_lm_head_weights) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 3b9ef4d2de4..a1dac609243 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -585,7 +585,8 @@ def linear(self, output_channels, input_channels, bias=bias, act_dtype=act_dtype, wt_dtype=wt_dtype, scale_factor=scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, + asym=asym) self.linear_ops.append(op) return op @@ -597,10 +598,11 @@ def dq_split_linear(self, act_dtype: npt.DTypeLike = np.float16, wt_dtype: npt.DTypeLike = np.float16, scale_factor: bool = False, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): op = 
super().dq_split_linear(input_node, n_splits, output_channels, input_channels, False, act_dtype, wt_dtype, scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, asym=asym) self.linear_ops.append(op) return op diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py index b38299473d4..87459a99e98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py @@ -86,6 +86,7 @@ def __init__( device: str = "NPU", n_splits: int = 1, group_size: int = 0, + asym: bool = False ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -119,6 +120,7 @@ def __init__( hidden_states, self.vocab_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=n_splits, scale_factor=(group_size == 0), + asym=asym ) # define outputs diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index ee773614b63..337736a7ea8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -434,6 +434,12 @@ def convert_llm_for_deploy(model: torch.nn.Module, os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" + lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn") + if not isinstance(model.lm_head, SlicedLMHead): + lm_head_low_bit = model.lm_head.qtype + else: + lm_head_low_bit = model.lm_head.lm_heads[0].qtype + if model.config.model_type == "qwen2": if group_size == 0: if model.config.hidden_size == 1536: @@ -456,7 +462,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -517,7 +524,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "embedding_post": embedding_post, "cos_sin_input": cos_sin_input, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -556,7 +564,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "model_type": "minicpm", "embedding_post": True, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 6eb6b2553b7..38c86a63101 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -31,17 +31,33 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, model_norm = model.model.norm lm_head = model.lm_head lm_head_n_splits = 1 + asym = getattr(model.config, "asym", False) + if not isinstance(lm_head, SlicedLMHead): - weights = [(lm_head.weight, lm_head.scale)] + asym = lm_head.qtype == "asym_int4_rtn" + 
if asym: + weights = [(lm_head.weight, lm_head.scale, lm_head.zero)] + else: + weights = [(lm_head.weight, lm_head.scale)] else: lm_heads = lm_head.lm_heads + asym = lm_heads[0].qtype == "asym_int4_rtn" + print("asym is ", asym, lm_heads[0].qtype) lm_head_weights = [] scales = [] + zeros = [] for l in lm_heads: lm_head_weights.append(l.weight) scales.append(l.scale) - weights = [(torch.stack(lm_head_weights, axis=0), - torch.stack(scales, axis=0))] + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))] + else: + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0))] lm_head_n_splits = lm_head.split_num if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -60,6 +76,7 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, vocab_size=vocab_size, n_splits=lm_head_n_splits, group_size=group_size, + asym=asym ) last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head", @@ -67,9 +84,15 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, # save weights bins files if not isinstance(lm_head, SlicedLMHead): - weight_numpy = [ - lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), - ] + if not asym: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + ] + else: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + lm_head.zero.data.numpy() + ] else: weight_numpy = [v.numpy() for v in weights[0]] From b1a8e81c77b2033716f4ba0a668eec1af9d62aaa Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:06:33 +0800 Subject: [PATCH 17/21] fix lm_head with mixed_precision=True --- .../src/ipex_llm/transformers/npu_model.py | 46 +++++++++---------- .../transformers/npu_models/convert_mp.py | 3 +- .../transformers/npu_models/lm_head.py | 15 +++--- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 9744e2f85f1..983c6393496 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if (not hasattr(model, 'llm') and - model.config.model_type in ["qwen2", "llama", "minicpm"]): - from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - optimize_llm_single_process( - llm, - kv_len=max_context_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size, - qtype=qtype, - save_directory=save_directory, - fuse_layers=fuse_layers - ) - else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + # if (not hasattr(model, 'llm') and + # model.config.model_type in ["qwen2", "llama", "minicpm"]): + # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + # optimize_llm_single_process( + # llm, + # kv_len=max_context_len, + # max_prompt_len=max_prompt_len, + # transpose_value_cache=transpose_value_cache, + # group_size=quantization_group_size, + # qtype=qtype, + # save_directory=save_directory, + # fuse_layers=fuse_layers + # ) + # else: + optimize_llm( + 
llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 4edfd8aee01..12a4da5b512 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -181,11 +181,10 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # Do not split lm_head and use sym_int8 instead when mixed_precison is True is_split = (not mixed_precision) and qtype in ["sym_int4_rtn", "asym_int4_rtn"] split_num = 14 if is_split else 1 - print("enter here, split num is ", split_num) new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, group_size=quantization_group_size, - asym=(qtype == "asym_int4_rtn")) + asym=(qtype == "asym_int4_rtn") and (not mixed_precision)) del model.lm_head model.lm_head = new_lm_head diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py index da1d8c9d37e..0184805996b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py @@ -55,7 +55,6 @@ def __init__( self.batch = batch self.split_num = split_num - if use_split: input = self.parameter((1, self.batch, self.inC)) res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype, @@ -172,15 +171,15 @@ def get_fused_lm_head(self): for i in range(self.split_num): weights.append(self.lm_heads[i].weight) scales.append(self.lm_heads[i].scale) - if self.asym: + if self.lm_heads[i].zero is not None: zeros.append(self.lm_heads[i].zero) - if self.asym: - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy(), - torch.stack(zeros, axis=0).numpy(),) + if len(zeros): + fused_lm_head_weights = [(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy(), + torch.stack(zeros, axis=0).numpy())] else: - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy()) + fused_lm_head_weights = [(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy())] else: if self.asym: fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), From 4e56e795f689fd36d84fcb726c660f8384c0c9d6 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:48:36 +0800 Subject: [PATCH 18/21] fix style --- python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 12a4da5b512..64d6f30b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -184,7 +184,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, group_size=quantization_group_size, - asym=(qtype == "asym_int4_rtn") and (not mixed_precision)) + asym=((qtype == "asym_int4_rtn") and 
+ (not mixed_precision))) del model.lm_head model.lm_head = new_lm_head From 558e10100e167cd71deb4206a0436e657688c88f Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:54:05 +0800 Subject: [PATCH 19/21] revert test code --- .../src/ipex_llm/transformers/npu_model.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 983c6393496..9744e2f85f1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - # if (not hasattr(model, 'llm') and - # model.config.model_type in ["qwen2", "llama", "minicpm"]): - # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - # optimize_llm_single_process( - # llm, - # kv_len=max_context_len, - # max_prompt_len=max_prompt_len, - # transpose_value_cache=transpose_value_cache, - # group_size=quantization_group_size, - # qtype=qtype, - # save_directory=save_directory, - # fuse_layers=fuse_layers - # ) - # else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + if (not hasattr(model, 'llm') and + model.config.model_type in ["qwen2", "llama", "minicpm"]): + from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + optimize_llm_single_process( + llm, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size, + qtype=qtype, + save_directory=save_directory, + fuse_layers=fuse_layers + ) + else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm From f1f1bb2953d2fa6c96bfe073e94f57be5e54a706 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 17:14:11 +0800 Subject: [PATCH 20/21] add down proj back for q4_0 --- python/llm/src/ipex_llm/transformers/npu_models/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 4aa45b0054a..2842799b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -93,6 +93,10 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] + if qtype == "sym_int4_rtn": + if (layer.in_features == 18944 and layer.out_features == 3584): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), iqtype, device=device, From 17a896fd446c8ce59e006ddd29fac22335b8480e Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 17:22:51 +0800 Subject: [PATCH 21/21] remove print --- python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py | 1 - 1 
file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 38c86a63101..bb8003f06a7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -42,7 +42,6 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, else: lm_heads = lm_head.lm_heads asym = lm_heads[0].qtype == "asym_int4_rtn" - print("asym is ", asym, lm_heads[0].qtype) lm_head_weights = [] scales = [] zeros = []
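
For reference, a minimal sketch (not part of the patches above) of the affine dequantization that the asym_int4_rtn ("q4_1"-style) path performs when group_size == 0: ggml_convert_qtype returns a single float buffer whose first half holds the per-output-row scales and whose second half holds the per-output-row minimums, which replace_with_QuantizedLinear / replace_with_DequantizedLinear split with torch.split, and DequantizedLinear then applies W ≈ Q * scale + min row-wise. The helper name below and the assumption that the int4 values are already unpacked into one row per output channel are illustrative only.

    import torch

    def dequant_asym_int4(decompressed: torch.Tensor,
                          scale_and_min: torch.Tensor) -> torch.Tensor:
        # First half of the buffer is per-row scales, second half per-row minimums,
        # mirroring `scale, min = torch.split(scale, scale.shape[0] // 2)` in convert.py.
        scale, zero = torch.split(scale_and_min, scale_and_min.shape[0] // 2)
        # Row-wise affine dequantization, as in DequantizedLinear: W ~= Q * scale + min.
        w = decompressed.to(torch.float32) * scale.to(torch.float32).unsqueeze(1)
        return w + zero.to(torch.float32).unsqueeze(1)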