From ff340b5c4388016070f980dfd569627b4a471030 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 15:35:47 +0800 Subject: [PATCH 01/21] initiail support of q4_1 --- python/llm/src/ipex_llm/ggml/quantize.py | 1 + .../ipex_llm/transformers/low_bit_linear.py | 16 +++++++++++----- .../src/ipex_llm/transformers/npu_model.py | 1 + .../transformers/npu_models/convert.py | 19 ++++++++++++++++--- .../transformers/npu_models/linear.py | 15 +++++++++++++++ 5 files changed, 44 insertions(+), 8 deletions(-) diff --git a/python/llm/src/ipex_llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py index 76702e88117..a95e3464e32 100644 --- a/python/llm/src/ipex_llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -52,6 +52,7 @@ "fp6_k": 30, "sym_int4_rtn": 31, "sym_int8_rtn": 32, + "asym_int4_rtn": 33, } # mixed precison from llama.cpp diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 82fbdf6f506..ed44140d708 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -84,8 +84,10 @@ FP6_K = ggml_tensor_qtype["fp6_k"] SYM_INT4_RTN = ggml_tensor_qtype["sym_int4_rtn"] SYM_INT8_RTN = ggml_tensor_qtype["sym_int8_rtn"] +ASYM_INT4_RTN = ggml_tensor_qtype["asym_int4_rtn"] RTN_DTYPE = { SYM_INT4_RTN: torch.uint8, + ASYM_INT4_RTN: torch.uint8, SYM_INT8_RTN: torch.int8, } @@ -223,12 +225,16 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, f"Last dim of input tensor must be multiple of {QK}") dst_size = (n // QK) * block_size_in_bytes - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: dst_tensor = torch.empty(dst_size, dtype=RTN_DTYPE[qtype], device=device) dst_tensor = dst_tensor.reshape(tensor.shape[0], tensor.shape[-1] // QK) - scale = torch.empty(n // k, dtype=torch.float32, - device=device) + if qtype == ASYM_INT4_RTN: + scale = torch.empty((n // k) * 2, dtype=torch.float32, + device=device) + else: + scale = torch.empty(n // k, dtype=torch.float32, + device=device) elif qtype == NF4: # Deepspeed zero3 requires unified dtype, # thus here uses bfloat16 consistent to other layers @@ -244,7 +250,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) hist = (ctypes.c_int64 * 16)() if qtype not in [IQ2_XXS, IQ2_XS, Q2_K, IQ1_S, Q4_K, Q6_K, Q5_K, FP6_K]: - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: scale_ptr = ctypes.cast(scale.data.data_ptr(), ctypes.POINTER(ctypes.c_float)) if imatrix is None: ggml.ggml_quantize_tensor_rtn(src, dst, scale_ptr, qtype, n, @@ -269,7 +275,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, ggml.ggml_quantize_tensor_with_weights(src, dst, qtype, n // in_features, in_features, hist, imatrix) - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: return dst_tensor, scale.type(torch.float16) else: return dst_tensor diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 9dbbd1b8fde..f1e14a78f89 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -103,6 +103,7 @@ def from_pretrained(cls, *args, **kwargs): qtype_map = { "sym_int4": "sym_int4_rtn", "sym_int8": "sym_int8_rtn", + "asym_int4": "asym_int4_rtn", } invalidInputError( diff --git 
a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 9ac0c9a6dda..8d5339575a0 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -99,8 +99,12 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return QuantizedLinear(qweights, scale, layer.bias, - group_size=group_size) + min = None + # split scale to scale & min + if qtype == "asym_int4_rtn": + scale, min = torch.split(scale, scale.shape[0] // 2) + return QuantizedLinear(qweights, scale, min, layer.bias, + group_size=group_size, qtype=qtype) @module_optimization @@ -111,12 +115,21 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): + if qtype == "sym_int4_rtn": + # workaround for qwen2-7B & int4 + if (layer.in_features == 3584 and layer.out_features == 152064): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return DequantizedLinear(qweights, scale, layer.bias) + min = None + # split scale to scale & min + if qtype == "asym_int4_rtn": + scale, min = torch.split(scale, scale.shape[0] // 2) + return DequantizedLinear(qweights, scale, min, layer.bias, qtype) @module_optimization diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 2c4b5f37738..60ba4b7ba0f 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -129,7 +129,9 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + min: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", group_size: int = 0, ): """Initialize the QuantizedLinear class. @@ -137,8 +139,10 @@ def __init__( Args: weight (torch.Tensor): Linear operation weight scale (torch.Tensor): Quantization scale + min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. + qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format @@ -163,6 +167,8 @@ def __init__( self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) self.bias = bias + self.min = min + self.qtype = qtype self.op_id = str(uuid.uuid4()) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -197,6 +203,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = run_matmul(x, self.weight.data, self.scale.data, self.op_id) + if self.qtype == "asym_int4_rtn" and self.min is not None: + out = out + self.min + if self.bias is None: return out return out + self.bias @@ -209,14 +218,18 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + min: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", ): """Initialize the DequantizedLinear class. 
Args: weight (torch.Tensor): Linear operation quantized weight scale (torch.Tensor): Quantization scale + min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. + qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format """ @@ -240,6 +253,8 @@ def __init__( decompressed_weight = combined_weight.view(combined_weight.size(0), -1) dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) + if qtype == "asym_int4_rtn" and min is not None: + dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: dequantized_weight = weight.to(torch.float32) * \ From 1e423049d0c133c1d04aab276bec57f1441c0091 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 16:41:26 +0800 Subject: [PATCH 02/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/convert.py | 2 +- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 8d5339575a0..8fd0704b194 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -115,7 +115,7 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): - if qtype == "sym_int4_rtn": + if qtype in ["sym_int4_rtn", "asym_int4_rtn"]: # workaround for qwen2-7B & int4 if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 60ba4b7ba0f..6eb3835ace8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -245,7 +245,8 @@ def __init__( ) if weight.dtype == torch.uint8: - weight = weight.view(torch.int8) + if qtype == "sym_int_rtn": + weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 From 77321a736e0d8db6696aab58c043365b0d441cde Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Tue, 3 Dec 2024 17:29:06 +0800 Subject: [PATCH 03/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 6eb3835ace8..0b636b098d6 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -245,7 +245,7 @@ def __init__( ) if weight.dtype == torch.uint8: - if qtype == "sym_int_rtn": + if qtype == "sym_int4_rtn": weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 @@ -255,7 +255,8 @@ def __init__( dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: - dequantized_weight = dequantized_weight + 
torch.unsqueeze(min.to(torch.float32), dim=1) + dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), + dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: dequantized_weight = weight.to(torch.float32) * \ From 21d9811e6c92053dec82dc0fe16a358b09a94d29 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 09:43:55 +0800 Subject: [PATCH 04/21] update --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- .../src/ipex_llm/transformers/npu_models/linear.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index f1e14a78f89..21afc14ca90 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -221,7 +221,7 @@ def from_pretrained(cls, *args, **kwargs): model = cls.optimize_npu_model(*args, **optimize_kwargs) else: from ipex_llm.transformers.npu_models.convert import optimize_llm - optimize_llm(model) + # optimize_llm(model) with torch.no_grad(): cls.load_convert(qtype, model, "cpu", modules_to_not_convert, quantization_group_size, imatrix_data, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 0b636b098d6..ff9eb5e31b3 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -201,7 +201,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) ) - out = run_matmul(x, self.weight.data, self.scale.data, self.op_id) + min_data = self.min.data if self.min is not None else None + out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) if self.qtype == "asym_int4_rtn" and self.min is not None: out = out + self.min @@ -245,15 +246,16 @@ def __init__( ) if weight.dtype == torch.uint8: - if qtype == "sym_int4_rtn": - weight = weight.view(torch.int8) + weight = weight.view(torch.int8) high_4bits = weight >> 4 low_4bits = (weight << 4) >> 4 combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - dequantized_weight = decompressed_weight.to(torch.float32) * \ - torch.unsqueeze(scale.to(torch.float32), dim=1) + if qtype == "asym_int4_rtn": + decompressed_weight = decompressed_weight + 8 + dequantized_weight = decompressed_weight.to(torch.float32) + dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) From 23f902312a73e3ac3565a8c93e2a7de2caa5500a Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 11:44:27 +0800 Subject: [PATCH 05/21] update min to Z1 --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index ff9eb5e31b3..22d64b005ca 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -252,8 +252,6 @@ def __init__( combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - if qtype == "asym_int4_rtn": - 
decompressed_weight = decompressed_weight + 8 dequantized_weight = decompressed_weight.to(torch.float32) dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: From 293fb8386a89a3d766829f49dfaf651f48dab419 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 14:16:46 +0800 Subject: [PATCH 06/21] update --- .../llm/src/ipex_llm/transformers/npu_models/convert.py | 5 ++--- .../llm/src/ipex_llm/transformers/npu_models/linear.py | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 8fd0704b194..cc641fbc3b2 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -88,10 +88,9 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): - if qtype == "sym_int4_rtn": + if qtype in ["sym_int4_rtn", "asym_int4_rtn"]: # workaround for qwen2-7B & int4 - if (layer.in_features == 3584 and layer.out_features == 152064) or \ - (layer.in_features == 18944 and layer.out_features == 3584): + if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 22d64b005ca..eaa1cd7c718 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -166,8 +166,11 @@ def __init__( # Int4 we need to double the input channels because weights are compressed self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) + if min is not None: + self.min = Parameter(min * math.sqrt(self.inC), requires_grad=False) + else: + self.min = None self.bias = bias - self.min = min self.qtype = qtype self.op_id = str(uuid.uuid4()) @@ -202,11 +205,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) min_data = self.min.data if self.min is not None else None + print("min is None:", min is None) out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) - if self.qtype == "asym_int4_rtn" and self.min is not None: - out = out + self.min - if self.bias is None: return out return out + self.bias From 3e799d59eccc76870fe447f49e5342d9fbd9c14c Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:15:39 +0800 Subject: [PATCH 07/21] fix --- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index eaa1cd7c718..bbb006aa48d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -205,7 +205,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) min_data = self.min.data if self.min is not None else None - print("min is None:", min is None) out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) if self.bias is None: From 17fbe84944eafd8b52e652bb676af51c8f2427aa Mon Sep 17 
00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:44:25 +0800 Subject: [PATCH 08/21] update --- python/llm/src/ipex_llm/transformers/npu_model.py | 7 ++++++- python/llm/src/ipex_llm/transformers/npu_models/linear.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 21afc14ca90..4bdc5eb4aab 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -155,6 +155,11 @@ def from_pretrained(cls, *args, **kwargs): f"but got {quantization_group_size}" ) ) + + if low_bit == "asym_int4": + invalidInputError(quantization_group_size > 0, + "asym_int4 only support quantization_group_size == 0 for now.") + _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) @@ -221,7 +226,7 @@ def from_pretrained(cls, *args, **kwargs): model = cls.optimize_npu_model(*args, **optimize_kwargs) else: from ipex_llm.transformers.npu_models.convert import optimize_llm - # optimize_llm(model) + optimize_llm(model) with torch.no_grad(): cls.load_convert(qtype, model, "cpu", modules_to_not_convert, quantization_group_size, imatrix_data, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index bbb006aa48d..4decce2893b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -252,8 +252,8 @@ def __init__( combined_weight = torch.cat((low_4bits.unsqueeze(2), high_4bits.unsqueeze(2)), dim=2) decompressed_weight = combined_weight.view(combined_weight.size(0), -1) - dequantized_weight = decompressed_weight.to(torch.float32) - dequantized_weight = dequantized_weight * torch.unsqueeze(scale.to(torch.float32), dim=1) + dequantized_weight = decompressed_weight.to(torch.float32) * \ + torch.unsqueeze(scale.to(torch.float32), dim=1) if qtype == "asym_int4_rtn" and min is not None: dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), dim=1) From c5e14ad352f8e1be34a6d3fc4396dc5689a1dc7f Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 15:45:51 +0800 Subject: [PATCH 09/21] fix style --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 4bdc5eb4aab..b08b0352a98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -155,7 +155,7 @@ def from_pretrained(cls, *args, **kwargs): f"but got {quantization_group_size}" ) ) - + if low_bit == "asym_int4": invalidInputError(quantization_group_size > 0, "asym_int4 only support quantization_group_size == 0 for now.") From 59dfc424b03a15437154835a5f0b9d2134a402ae Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 16:01:37 +0800 Subject: [PATCH 10/21] fix --- python/llm/src/ipex_llm/transformers/npu_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index b08b0352a98..2a56de7df6d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -157,7 +157,7 @@ def from_pretrained(cls, *args, **kwargs): ) if low_bit == "asym_int4": - invalidInputError(quantization_group_size > 0, + 
invalidInputError(quantization_group_size == 0, "asym_int4 only support quantization_group_size == 0 for now.") _args = copy.deepcopy(args) From 1cc8b96a6fbaa270fdc273be828fb30896134861 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 18:07:27 +0800 Subject: [PATCH 11/21] support qwen2 optimize_model=True mp version --- .../src/ipex_llm/transformers/npu_model.py | 51 ++++++++++--------- .../transformers/npu_models/convert_mp.py | 2 +- .../transformers/npu_models/mp_models_base.py | 42 ++++++++++----- .../transformers/npu_models/qwen2_mp.py | 44 ++++++++++++---- 4 files changed, 91 insertions(+), 48 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 2a56de7df6d..7f86f6e45d5 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -276,6 +276,7 @@ def optimize_npu_model(cls, *args, **kwargs): with torch.no_grad(): model.config.update({"mixed_precision": mixed_precision}) model.config.update({"group_size": quantization_group_size}) + model.config.update({"asym": qtype == "asym_int4_rtn"}) optimize_llm_pre(model, qtype, mixed_precision, quantization_group_size=quantization_group_size) cls.load_convert(qtype, model, "cpu", modules_to_not_convert, @@ -288,29 +289,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if (not hasattr(model, 'llm') and - model.config.model_type in ["qwen2", "llama", "minicpm"]): - from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - optimize_llm_single_process( - llm, - kv_len=max_context_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size, - qtype=qtype, - save_directory=save_directory, - fuse_layers=fuse_layers - ) - else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + # if (not hasattr(model, 'llm') and + # model.config.model_type in ["qwen2", "llama", "minicpm"]): + # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + # optimize_llm_single_process( + # llm, + # kv_len=max_context_len, + # max_prompt_len=max_prompt_len, + # transpose_value_cache=transpose_value_cache, + # group_size=quantization_group_size, + # qtype=qtype, + # save_directory=save_directory, + # fuse_layers=fuse_layers + # ) + # else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm @@ -422,9 +423,9 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) ) invalidInputError( - qtype in ["sym_int8_rtn", "sym_int4_rtn"], + qtype in ["sym_int8_rtn", "sym_int4_rtn", "asym_int4_rtn"], f"Unknown bigdl_transformers_low_bit value: {qtype}," - f" expected: sym_int8_rtn, sym_int4_rtn. " + f" expected: sym_int8_rtn, sym_int4_rtn, asym_int4_rtn. 
" ) if enable_cpp_backend: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 39c9cd00fe6..0d1089963c9 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -128,7 +128,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, from ipex_llm.transformers.npu_models.common import split_linears if quantization_group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index ccf6e242d90..59cc2b1920e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -59,9 +59,15 @@ def run_model( op_args_flatten = [] for w in weights: if isinstance(w, tuple): # from QuantizedLinear - op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) - op_args_flatten.append(op_args[-1][0]) - op_args_flatten.append(op_args[-1][1]) + if len(w) == 2: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + else: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), set_contiguous(w[2]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + op_args_flatten.append(op_args[-1][2]) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_args.append(w.numpy()) op_args_flatten.append(op_args[-1]) @@ -104,7 +110,7 @@ def run_model( class LLMBaseNNFactory(NNFactory): def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="NPU", - n_splits_linear=1, n_splits_down_proj=1, group_size=0): + n_splits_linear=1, n_splits_down_proj=1, group_size=0, asym=False): super().__init__(profile, device) self.cache_parameter_ops = [] self.input_ops = [] @@ -117,6 +123,7 @@ def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="N self.n_splits_linear = n_splits_linear self.n_splits_down_proj = n_splits_down_proj self.group_size = group_size + self.asym = asym def attention(self, *, @@ -149,7 +156,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) key_states = self.linear( @@ -160,7 +168,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) value_states = self.linear( @@ -171,7 +180,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) if q_bias is not None: @@ -260,7 +270,8 @@ def attention(self, attn_output, hidden_size, hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return attn_output, new_key_states, new_value_states @@ -428,13 +439,15 @@ 
def mlp(self, hidden_states, seq_len=-1, mode="prefill"): hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) mm2 = self.linear( hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] @@ -442,7 +455,8 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill"): mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_down_proj, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return hidden_states @@ -558,11 +572,13 @@ def linear(self, wt_dtype: npt.DTypeLike = np.float16, n_splits: int = 1, scale_factor: bool = True, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): if n_splits == 1: op = super().linear(input_node, output_channels, input_channels, bias, act_dtype, - wt_dtype, scale_factor=scale_factor) + wt_dtype, scale_factor=scale_factor, + asym=asym) else: op = super().dq_split_linear(input_node, n_splits, output_channels, input_channels, diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 015efe10031..d32f1751a98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -97,7 +97,8 @@ def __init__( intermediate_size, n_splits_linear: int = 1, n_splits_down_proj: int = 1, - group_size: int = 0 + group_size: int = 0, + asym: bool = False, ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -106,7 +107,8 @@ def __init__( device=device, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size) + group_size=group_size, + asym=asym) self.max_seq_len = max_seq_len self.intermediate_size = intermediate_size self.dtype = dtype @@ -311,6 +313,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() @@ -318,8 +321,10 @@ def __init__( op_parameters = [] for w in parameters: - if isinstance(w, tuple): # from QuantizedLinear + if isinstance(w, tuple) and not asym: # from QuantizedLinear op_parameters.append((w[0].numpy(), w[1].numpy())) + elif isinstance(w, tuple) and asym: # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy(), w[2].numpy())) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_parameters.append(w.numpy()) elif isinstance(w, np.ndarray): # scale @@ -375,7 +380,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym, ) self.backend_decoders.append(decoder) @@ -461,6 +467,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() self.op_parameters = parameters @@ -491,7 +498,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + 
group_size=group_size, + asym=asym ) self.layer_norm_0 = layer_norm_0 self.layer_norm_1 = layer_norm_1 @@ -580,6 +588,7 @@ def run_decode( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -592,10 +601,17 @@ def run_decode( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -630,7 +646,8 @@ def run_decode( do_print=False, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) dist.barrier() @@ -809,6 +826,7 @@ def run_prefill( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -821,10 +839,17 @@ def run_prefill( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -850,7 +875,8 @@ def run_prefill( transpose_value=transpose_value_cache, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) layer_weights.extend(weights) From 5f02aacb6146f836f62a663bd73af4e070bbc566 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 18:57:12 +0800 Subject: [PATCH 12/21] temp save --- .../src/ipex_llm/transformers/npu_model.py | 46 ++++++------ .../npu_pipeline_model/convert_pipeline.py | 6 +- .../transformers/npu_pipeline_model/qwen.py | 75 ++++++++++++++----- 3 files changed, 85 insertions(+), 42 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 7f86f6e45d5..657daa0aa3e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -289,29 +289,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - # if (not hasattr(model, 'llm') and - # model.config.model_type in ["qwen2", "llama", "minicpm"]): - # from ipex_llm.transformers.npu_models.convert import 
optimize_llm_single_process - # optimize_llm_single_process( - # llm, - # kv_len=max_context_len, - # max_prompt_len=max_prompt_len, - # transpose_value_cache=transpose_value_cache, - # group_size=quantization_group_size, - # qtype=qtype, - # save_directory=save_directory, - # fuse_layers=fuse_layers - # ) - # else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + if (not hasattr(model, 'llm') and + model.config.model_type in ["qwen2", "llama", "minicpm"]): + from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + optimize_llm_single_process( + llm, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size, + qtype=qtype, + save_directory=save_directory, + fuse_layers=fuse_layers + ) + else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 2e6b249c1a5..5304d124539 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -201,7 +201,7 @@ def convert_llm(model: torch.nn.Module, layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" if group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: @@ -433,6 +433,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, if not os.path.exists(weight_dir): os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" + asym = getattr(model.config, "asym", False) if model.config.model_type == "qwen2": if group_size == 0: @@ -456,7 +457,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "asym": asym} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index e4b318244ce..40d2ebf135b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -104,6 +104,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, head_dim = model.model.layers[0].self_attn.head_dim intermediate_size = model.config.intermediate_size rms_norm_eps = model.config.rms_norm_eps + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer curr_layer = model.model.layers[layer_idx] @@ -117,10 +118,17 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in 
layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16) k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16) @@ -164,7 +172,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) rest_blob_path = update_names_of_IR_and_export_blob(single_decoder, decoder_name, @@ -188,11 +197,23 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, k_bias.data.numpy().tofile(k_bias_bin_file) v_bias.data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, min) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + min.numpy().tofile(bin_file) del single_decoder @@ -207,6 +228,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down rms_norm_eps = model.config.rms_norm_eps layer_num = len(model.model.layers) fused_layer_num = layer_num // fused_layers + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer for i in range(fused_layers): @@ -228,15 +250,22 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down weights = [] for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list, - attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, - mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, - mlp_layer.down_proj_dq_list]: + attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, + mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, + mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + mins = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.min is not None: + mins.append(l.min) + if len(mins): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(mins, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = 
curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -264,12 +293,23 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down k_biases[-1].data.numpy().tofile(k_bias_bin_file) v_biases[-1].data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, - f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, min) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + min.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -297,6 +337,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, group_size=group_size + asym=asym ) update_names_of_IR_and_export_blob(fused_decoder, f"decoder_layer_{i}", From dc1ec36e425482946f13cb6a33bcb41869d5519d Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 19:46:59 +0800 Subject: [PATCH 13/21] fix --- .../npu_pipeline_model/convert_pipeline.py | 4 +--- .../ipex_llm/transformers/npu_pipeline_model/qwen.py | 10 +++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 5304d124539..ee773614b63 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -433,7 +433,6 @@ def convert_llm_for_deploy(model: torch.nn.Module, if not os.path.exists(weight_dir): os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" - asym = getattr(model.config, "asym", False) if model.config.model_type == "qwen2": if group_size == 0: @@ -457,8 +456,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj, - "asym": asym} + "n_splits_down_proj": n_splits_down_proj} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 40d2ebf135b..40443ace85d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -205,7 +205,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, 
(weight, scale, min) in enumerate(weights): + for idx, (weight, scale, m) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -213,7 +213,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - min.numpy().tofile(bin_file) + m.numpy().tofile(bin_file) del single_decoder @@ -301,7 +301,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, min) in enumerate(weights): + for idx, (weight, scale, m) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -309,7 +309,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - min.numpy().tofile(bin_file) + m.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -336,7 +336,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, asym=asym ) update_names_of_IR_and_export_blob(fused_decoder, From f54afdc41b72633576162775603387d2b43580a8 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Wed, 4 Dec 2024 20:01:00 +0800 Subject: [PATCH 14/21] fix style --- .../transformers/npu_models/mp_models_base.py | 3 ++- .../ipex_llm/transformers/npu_pipeline_model/qwen.py | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 59cc2b1920e..3b9ef4d2de4 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -64,7 +64,8 @@ def run_model( op_args_flatten.append(op_args[-1][0]) op_args_flatten.append(op_args[-1][1]) else: - op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), set_contiguous(w[2]).numpy())) + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), + set_contiguous(w[2]).numpy())) op_args_flatten.append(op_args[-1][0]) op_args_flatten.append(op_args[-1][1]) op_args_flatten.append(op_args[-1][2]) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 40443ace85d..28f4cb8dc4d 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -250,9 +250,9 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down weights = [] for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list, - attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, - mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, - mlp_layer.down_proj_dq_list]: + attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, + mlp_layer.gate_proj_dq_list, 
mlp_layer.up_proj_dq_list, + mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] mins = [] @@ -295,14 +295,16 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down # 6, 7 are past k/v if not asym: for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: for idx, (weight, scale, m) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") From 0450c7b2d846e0458a60b80338c1532bd1fb4829 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 10:29:56 +0800 Subject: [PATCH 15/21] replace min with zero --- .../transformers/npu_models/convert.py | 16 +++++------ .../transformers/npu_models/linear.py | 22 +++++++-------- .../transformers/npu_models/qwen2_mp.py | 20 ++++++------- .../transformers/npu_pipeline_model/qwen.py | 28 +++++++++---------- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index cc641fbc3b2..4aa45b0054a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -98,11 +98,11 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - min = None - # split scale to scale & min + zero = None + # split scale to scale & zero if qtype == "asym_int4_rtn": - scale, min = torch.split(scale, scale.shape[0] // 2) - return QuantizedLinear(qweights, scale, min, layer.bias, + scale, zero = torch.split(scale, scale.shape[0] // 2) + return QuantizedLinear(qweights, scale, zero, layer.bias, group_size=group_size, qtype=qtype) @@ -124,11 +124,11 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - min = None - # split scale to scale & min + zero = None + # split scale to scale & zero if qtype == "asym_int4_rtn": - scale, min = torch.split(scale, scale.shape[0] // 2) - return DequantizedLinear(qweights, scale, min, layer.bias, qtype) + scale, zero = torch.split(scale, scale.shape[0] // 2) + return DequantizedLinear(qweights, scale, zero, layer.bias, qtype) @module_optimization diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 4decce2893b..eb9ae98c75e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -129,7 +129,7 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, - min: Optional[torch.Tensor] = None, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, qtype: Optional[str] = "sym_int4_rtn", group_size: int = 0, @@ -139,7 +139,7 @@ def __init__( Args: weight (torch.Tensor): Linear operation weight scale (torch.Tensor): Quantization scale - min 
(Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. qtype (Optional[str], optional): qtype of this Linear @@ -166,10 +166,10 @@ def __init__( # Int4 we need to double the input channels because weights are compressed self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) - if min is not None: - self.min = Parameter(min * math.sqrt(self.inC), requires_grad=False) + if zero is not None: + self.zero = Parameter(zero * math.sqrt(self.inC), requires_grad=False) else: - self.min = None + self.zero = None self.bias = bias self.qtype = qtype self.op_id = str(uuid.uuid4()) @@ -204,8 +204,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) ) - min_data = self.min.data if self.min is not None else None - out = run_matmul(x, self.weight.data, self.scale.data, min_data, self.op_id) + zero_data = self.zero.data if self.zero is not None else None + out = run_matmul(x, self.weight.data, self.scale.data, zero_data, self.op_id) if self.bias is None: return out @@ -219,7 +219,7 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, - min: Optional[torch.Tensor] = None, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, qtype: Optional[str] = "sym_int4_rtn", ): @@ -227,7 +227,7 @@ def __init__( Args: weight (torch.Tensor): Linear operation quantized weight scale (torch.Tensor): Quantization scale - min (Optional[torch.Tensor], optional): Quantization min for asym_int4_rtn + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 
qtype (Optional[str], optional): qtype of this Linear @@ -254,8 +254,8 @@ def __init__( decompressed_weight = combined_weight.view(combined_weight.size(0), -1) dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) - if qtype == "asym_int4_rtn" and min is not None: - dequantized_weight = dequantized_weight + torch.unsqueeze(min.to(torch.float32), + if qtype == "asym_int4_rtn" and zero is not None: + dequantized_weight = dequantized_weight + torch.unsqueeze(zero.to(torch.float32), dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index d32f1751a98..397739cb72a 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -601,15 +601,15 @@ def run_decode( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -839,15 +839,15 @@ def run_prefill( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 28f4cb8dc4d..6eb6b2553b7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -118,15 +118,15 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -205,7 +205,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, m) in enumerate(weights): + for idx, (weight, scale, zero) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, @@ -213,7 +213,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, 
f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - m.numpy().tofile(bin_file) + zero.numpy().tofile(bin_file) del single_decoder @@ -255,15 +255,15 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] - mins = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - if l.min is not None: - mins.append(l.min) - if len(mins): + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), - torch.stack(mins, axis=0))) + torch.stack(zeros, axis=0))) else: weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) @@ -302,7 +302,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") scale.numpy().tofile(bin_file) else: - for idx, (weight, scale, m) in enumerate(weights): + for idx, (weight, scale, zero) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") weight.numpy().tofile(bin_file) @@ -311,7 +311,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down scale.numpy().tofile(bin_file) bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") - m.numpy().tofile(bin_file) + zero.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 From ecdd8274b0b32ffdb10a62aee8d0d601ff353b76 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 15:04:09 +0800 Subject: [PATCH 16/21] support split linear for q4_1 --- .../src/ipex_llm/transformers/npu_model.py | 5 --- .../transformers/npu_models/convert_mp.py | 15 +++++--- .../transformers/npu_models/linear.py | 4 +-- .../transformers/npu_models/lm_head.py | 34 +++++++++++++----- .../transformers/npu_models/mp_models_base.py | 8 +++-- .../transformers/npu_pipeline_model/common.py | 2 ++ .../npu_pipeline_model/convert_pipeline.py | 15 ++++++-- .../transformers/npu_pipeline_model/qwen.py | 35 +++++++++++++++---- 8 files changed, 85 insertions(+), 33 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 657daa0aa3e..9744e2f85f1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -156,11 +156,6 @@ def from_pretrained(cls, *args, **kwargs): ) ) - if low_bit == "asym_int4": - invalidInputError(quantization_group_size == 0, - "asym_int4 only support quantization_group_size == 0 for now.") - - _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) try: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 0d1089963c9..4edfd8aee01 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -154,18 +154,21 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # workaround for MiniCPM-2B new_lm_head_0 = SlicedLMHead(model.lm_head_0.weight, split_num=split_num, bias=model.lm_head_0.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head_0 model.lm_head_0 = new_lm_head_0 new_lm_head_1 = SlicedLMHead(model.lm_head_1.weight, split_num=split_num, 
bias=model.lm_head_1.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head_1 model.lm_head_1 = new_lm_head_1 else: new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head model.lm_head = new_lm_head @@ -176,11 +179,13 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # Do not split lm_head and use sym_int8 instead when mixed_precison is True if quantization_group_size == 0: # Do not split lm_head and use sym_int8 instead when mixed_precison is True - is_split = (not mixed_precision) and qtype == "sym_int4_rtn" + is_split = (not mixed_precision) and qtype in ["sym_int4_rtn", "asym_int4_rtn"] split_num = 14 if is_split else 1 + print("enter here, split num is ", split_num) new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head model.lm_head = new_lm_head diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index eb9ae98c75e..c8a5dd467ae 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -159,8 +159,10 @@ def __init__( ) ) self.outC, self.inC = self.weight.shape + self.zero = None if group_size != 0: self.scale = Parameter(scale, requires_grad=False) + self.zero = Parameter(zero, requires_grad=False) else: if self.weight.dtype == torch.uint8: # Int4 we need to double the input channels because weights are compressed @@ -168,8 +170,6 @@ def __init__( self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) if zero is not None: self.zero = Parameter(zero * math.sqrt(self.inC), requires_grad=False) - else: - self.zero = None self.bias = bias self.qtype = qtype self.op_id = str(uuid.uuid4()) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py index f306ae0e4e0..da1d8c9d37e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py @@ -36,6 +36,7 @@ def __init__( dtype: np.dtype = np.int8, use_split: bool = False, group_size: int = 0, + asym: bool = False, ): """Initialize the LMHeadLinear class. 
@@ -58,7 +59,7 @@ def __init__( if use_split: input = self.parameter((1, self.batch, self.inC)) res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype, - scale_factor=(group_size == 0)) + scale_factor=(group_size == 0), asym=asym) else: input = self.parameter((self.batch, self.inC)) split_size = self.inC // split_num // 2 * 2 @@ -69,7 +70,7 @@ def __init__( input_slice = self.slice(input, begin=[0, start_idx], end=[self.batch, end_idx]) linear_slice = self.linear(input_slice, outC, split_size, bias=False, - wt_dtype=dtype) + wt_dtype=dtype, asym=asym) if i == 0: res = linear_slice else: @@ -109,7 +110,7 @@ def run( class SlicedLMHead(nn.Module): - def __init__(self, weight, bias, split_num, use_split=False, group_size=0): + def __init__(self, weight, bias, split_num, use_split=False, group_size=0, asym=False): super().__init__() self.split_num = split_num self.outC, self.inC = weight.shape @@ -128,6 +129,7 @@ def __init__(self, weight, bias, split_num, use_split=False, group_size=0): self.lm_heads.append(new_linear) self.bias = bias self.use_split = use_split + self.asym = asym def forward(self, hidden_states): if hidden_states.size(0) * hidden_states.size(1) == 1: @@ -162,19 +164,33 @@ def get_fused_lm_head(self): np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8 self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num, False, "NPU", dtype=np_dtype, use_split=self.use_split, - group_size=self.group_size) + group_size=self.group_size, asym=self.asym) if self.use_split: weights = [] scales = [] + zeros = [] for i in range(self.split_num): weights.append(self.lm_heads[i].weight) scales.append(self.lm_heads[i].scale) - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy()) + if self.asym: + zeros.append(self.lm_heads[i].zero) + if self.asym: + fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy(), + torch.stack(zeros, axis=0).numpy(),) + else: + fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy()) else: - fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), - self.lm_heads[i].scale.data.numpy()) - for i in range(self.split_num)] + if self.asym: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy(), + self.lm_heads[i].zero.data.numpy()) + for i in range(self.split_num)] + else: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy()) + for i in range(self.split_num)] self.fused_lm_head.set_weights(self.lm_heads[0].op_id, fused_lm_head_weights) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index 3b9ef4d2de4..a1dac609243 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -585,7 +585,8 @@ def linear(self, output_channels, input_channels, bias=bias, act_dtype=act_dtype, wt_dtype=wt_dtype, scale_factor=scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, + asym=asym) self.linear_ops.append(op) return op @@ -597,10 +598,11 @@ def dq_split_linear(self, act_dtype: npt.DTypeLike = np.float16, wt_dtype: npt.DTypeLike = np.float16, scale_factor: bool = False, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): op = 
super().dq_split_linear(input_node, n_splits, output_channels, input_channels, False, act_dtype, wt_dtype, scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, asym=asym) self.linear_ops.append(op) return op diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py index b38299473d4..87459a99e98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py @@ -86,6 +86,7 @@ def __init__( device: str = "NPU", n_splits: int = 1, group_size: int = 0, + asym: bool = False ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -119,6 +120,7 @@ def __init__( hidden_states, self.vocab_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=n_splits, scale_factor=(group_size == 0), + asym=asym ) # define outputs diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index ee773614b63..337736a7ea8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -434,6 +434,12 @@ def convert_llm_for_deploy(model: torch.nn.Module, os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" + lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn") + if not isinstance(model.lm_head, SlicedLMHead): + lm_head_low_bit = model.lm_head.qtype + else: + lm_head_low_bit = model.lm_head.lm_heads[0].qtype + if model.config.model_type == "qwen2": if group_size == 0: if model.config.hidden_size == 1536: @@ -456,7 +462,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -517,7 +524,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "embedding_post": embedding_post, "cos_sin_input": cos_sin_input, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -556,7 +564,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "model_type": "minicpm", "embedding_post": True, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 6eb6b2553b7..38c86a63101 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -31,17 +31,33 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, model_norm = model.model.norm lm_head = model.lm_head lm_head_n_splits = 1 + asym = getattr(model.config, "asym", False) + if not isinstance(lm_head, SlicedLMHead): - weights = [(lm_head.weight, lm_head.scale)] + asym = lm_head.qtype == "asym_int4_rtn" + 
if asym: + weights = [(lm_head.weight, lm_head.scale, lm_head.zero)] + else: + weights = [(lm_head.weight, lm_head.scale)] else: lm_heads = lm_head.lm_heads + asym = lm_heads[0].qtype == "asym_int4_rtn" + print("asym is ", asym, lm_heads[0].qtype) lm_head_weights = [] scales = [] + zeros = [] for l in lm_heads: lm_head_weights.append(l.weight) scales.append(l.scale) - weights = [(torch.stack(lm_head_weights, axis=0), - torch.stack(scales, axis=0))] + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))] + else: + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0))] lm_head_n_splits = lm_head.split_num if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -60,6 +76,7 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, vocab_size=vocab_size, n_splits=lm_head_n_splits, group_size=group_size, + asym=asym ) last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head", @@ -67,9 +84,15 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, # save weights bins files if not isinstance(lm_head, SlicedLMHead): - weight_numpy = [ - lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), - ] + if not asym: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + ] + else: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + lm_head.zero.data.numpy() + ] else: weight_numpy = [v.numpy() for v in weights[0]] From b1a8e81c77b2033716f4ba0a668eec1af9d62aaa Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:06:33 +0800 Subject: [PATCH 17/21] fix lm_head with mixed_precision=True --- .../src/ipex_llm/transformers/npu_model.py | 46 +++++++++---------- .../transformers/npu_models/convert_mp.py | 3 +- .../transformers/npu_models/lm_head.py | 15 +++--- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 9744e2f85f1..983c6393496 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - if (not hasattr(model, 'llm') and - model.config.model_type in ["qwen2", "llama", "minicpm"]): - from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - optimize_llm_single_process( - llm, - kv_len=max_context_len, - max_prompt_len=max_prompt_len, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size, - qtype=qtype, - save_directory=save_directory, - fuse_layers=fuse_layers - ) - else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + # if (not hasattr(model, 'llm') and + # model.config.model_type in ["qwen2", "llama", "minicpm"]): + # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + # optimize_llm_single_process( + # llm, + # kv_len=max_context_len, + # max_prompt_len=max_prompt_len, + # transpose_value_cache=transpose_value_cache, + # group_size=quantization_group_size, + # qtype=qtype, + # save_directory=save_directory, + # fuse_layers=fuse_layers + # ) + # else: + optimize_llm( + 
llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 4edfd8aee01..12a4da5b512 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -181,11 +181,10 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # Do not split lm_head and use sym_int8 instead when mixed_precison is True is_split = (not mixed_precision) and qtype in ["sym_int4_rtn", "asym_int4_rtn"] split_num = 14 if is_split else 1 - print("enter here, split num is ", split_num) new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, group_size=quantization_group_size, - asym=(qtype == "asym_int4_rtn")) + asym=(qtype == "asym_int4_rtn") and (not mixed_precision)) del model.lm_head model.lm_head = new_lm_head diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py index da1d8c9d37e..0184805996b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py @@ -55,7 +55,6 @@ def __init__( self.batch = batch self.split_num = split_num - if use_split: input = self.parameter((1, self.batch, self.inC)) res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype, @@ -172,15 +171,15 @@ def get_fused_lm_head(self): for i in range(self.split_num): weights.append(self.lm_heads[i].weight) scales.append(self.lm_heads[i].scale) - if self.asym: + if self.lm_heads[i].zero is not None: zeros.append(self.lm_heads[i].zero) - if self.asym: - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy(), - torch.stack(zeros, axis=0).numpy(),) + if len(zeros): + fused_lm_head_weights = [(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy(), + torch.stack(zeros, axis=0).numpy())] else: - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy()) + fused_lm_head_weights = [(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy())] else: if self.asym: fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), From 4e56e795f689fd36d84fcb726c660f8384c0c9d6 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:48:36 +0800 Subject: [PATCH 18/21] fix style --- python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 12a4da5b512..64d6f30b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -184,7 +184,8 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, group_size=quantization_group_size, - asym=(qtype == "asym_int4_rtn") and (not mixed_precision)) + asym=((qtype == "asym_int4_rtn") and 
+ (not mixed_precision))) del model.lm_head model.lm_head = new_lm_head From 558e10100e167cd71deb4206a0436e657688c88f Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 16:54:05 +0800 Subject: [PATCH 19/21] revert test code --- .../src/ipex_llm/transformers/npu_model.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 983c6393496..9744e2f85f1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs): model.share_memory() if not pipeline: - # if (not hasattr(model, 'llm') and - # model.config.model_type in ["qwen2", "llama", "minicpm"]): - # from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process - # optimize_llm_single_process( - # llm, - # kv_len=max_context_len, - # max_prompt_len=max_prompt_len, - # transpose_value_cache=transpose_value_cache, - # group_size=quantization_group_size, - # qtype=qtype, - # save_directory=save_directory, - # fuse_layers=fuse_layers - # ) - # else: - optimize_llm( - llm, - max_context_len=max_context_len, - max_prompt_len=max_prompt_len, - inter_pp=inter_pp, - intra_pp=intra_pp, - transpose_value_cache=transpose_value_cache, - group_size=quantization_group_size - ) + if (not hasattr(model, 'llm') and + model.config.model_type in ["qwen2", "llama", "minicpm"]): + from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process + optimize_llm_single_process( + llm, + kv_len=max_context_len, + max_prompt_len=max_prompt_len, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size, + qtype=qtype, + save_directory=save_directory, + fuse_layers=fuse_layers + ) + else: + optimize_llm( + llm, + max_context_len=max_context_len, + max_prompt_len=max_prompt_len, + inter_pp=inter_pp, + intra_pp=intra_pp, + transpose_value_cache=transpose_value_cache, + group_size=quantization_group_size + ) else: from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \ import convert_llm From f1f1bb2953d2fa6c96bfe073e94f57be5e54a706 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 17:14:11 +0800 Subject: [PATCH 20/21] add down proj back for q4_0 --- python/llm/src/ipex_llm/transformers/npu_models/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 4aa45b0054a..2842799b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -93,6 +93,10 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, if (layer.in_features == 3584 and layer.out_features == 152064): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] + if qtype == "sym_int4_rtn": + if (layer.in_features == 18944 and layer.out_features == 3584): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), iqtype, device=device, From 17a896fd446c8ce59e006ddd29fac22335b8480e Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 5 Dec 2024 17:22:51 +0800 Subject: [PATCH 21/21] remove print --- python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py | 1 - 1 
file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 38c86a63101..bb8003f06a7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -42,7 +42,6 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, else: lm_heads = lm_head.lm_heads asym = lm_heads[0].qtype == "asym_int4_rtn" - print("asym is ", asym, lm_heads[0].qtype) lm_head_weights = [] scales = [] zeros = []
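
For reference, a minimal sketch (not part of the patches above) of the affine dequantization that the asym_int4_rtn ("q4_1"-style) path performs when group_size == 0: ggml_convert_qtype returns a single float buffer whose first half holds the per-output-row scales and whose second half holds the per-output-row minimums, which replace_with_QuantizedLinear / replace_with_DequantizedLinear split with torch.split, and DequantizedLinear then applies W ≈ Q * scale + min row-wise. The helper name below and the assumption that the int4 values are already unpacked into one row per output channel are illustrative only.

    import torch

    def dequant_asym_int4(decompressed: torch.Tensor,
                          scale_and_min: torch.Tensor) -> torch.Tensor:
        # First half of the buffer is per-row scales, second half per-row minimums,
        # mirroring `scale, min = torch.split(scale, scale.shape[0] // 2)` in convert.py.
        scale, zero = torch.split(scale_and_min, scale_and_min.shape[0] // 2)
        # Row-wise affine dequantization, as in DequantizedLinear: W ~= Q * scale + min.
        w = decompressed.to(torch.float32) * scale.to(torch.float32).unsqueeze(1)
        return w + zero.to(torch.float32).unsqueeze(1)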