Replace ITREX QBits with IPEX WOQ linear (#549)
Co-authored-by: Casper <[email protected]>
jiqing-feng and casper-hansen authored Sep 12, 2024
1 parent d5ec43d · commit eab1a4a
Showing 17 changed files with 279 additions and 326 deletions.
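
This PR swaps AutoAWQ's CPU backend from ITREX QBits to IPEX (Intel Extension for PyTorch) weight-only-quantization linear kernels: the `use_qbits` flag becomes `use_ipex`, `WQLinear_QBits` becomes `WQLinear_IPEX`, and the fused attention/norm modules gain SDPA and IPEX RMSNorm paths. The sketch below shows how the renamed flag would be used from the public API; the checkpoint path is a placeholder and `AutoAWQForCausalLM`/`generate` are assumed to behave as in the existing AutoAWQ examples, so treat it as illustrative rather than part of this diff.

```python
# Minimal sketch (assumptions noted above): CPU inference through the new IPEX path.
from awq import AutoAWQForCausalLM          # assumed public entry point
from transformers import AutoTokenizer

model_path = "path/to/awq-quantized-model"  # placeholder checkpoint

# use_ipex=True maps quantized layers to WQLinear_IPEX; on a CPU-only machine the
# same path is picked automatically because get_best_device() returns "cpu".
model = AutoAWQForCausalLM.from_quantized(model_path, use_ipex=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer("What does AWQ quantization do?", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
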
35 changes: 17 additions & 18 deletions README.md
@@ -228,26 +228,25 @@ GPU: 2x NVIDIA GeForce RTX 4090

### CPU

- CPU: INTEL(R) XEON(R) PLATINUM 8592+ with 8-channel 4800MT/s memory.
- CPU: 48 cores SPR (Intel 4th Gen Xeon CPU)
- Command: `python examples/benchmark.py --model_path <hf_model> --batch_size 1`

| Model | Size | Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (RAM) |
|--------:|------:|-----------:|-------------:|-----------------:|----------------:|---------------:|:------------------|
| Mixtral | 7B | 1 | 64 | 64 | 389.24 | 16.01 | 5.59 GB (0.02%) |
| Mixtral | 7B | 1 | 2048 | 2048 | 1412 | 17.76 | 6.29 GB (0.03%) |
| Vicuna | 7B | 1 | 64 | 64 | 346 | 18.13 | 8.18 GB (0.03%) |
| Vicuna | 7B | 1 | 2048 | 2048 | 1023.4 | 18.18 | 8.80 GB (0.04%) |
| LLaMA2 | 13B | 1 | 64 | 64 | 160.24 | 9.87 | 14.65 GB (0.06%) |
| LLaMA2 | 13B | 1 | 2048 | 2048 | 592.35 | 9.93 | 16.87 GB (0.07%) |
| Mosaicml | 7B | 1 | 64 | 64 | 433.17 | 18.79 | 4.60 GB (0.02%) |
| Mosaicml | 7B | 1 | 2048 | 2048 | 404.25 | 19.91 | 4.75 GB (0.02%) |
| Falcon | 7B | 1 | 64 | 64 | 303.16 | 14.41 | 5.18 GB (0.02%) |
| Falcon | 7B | 1 | 2048 | 2048 | 634.57 | 15.55 | 5.80 GB (0.02%) |
| CodeLlama | 34B | 1 | 64 | 64 | 153.73 | 4.23 | 29.00 GB (0.12%) |
| CodeLlama | 34B | 1 | 2048 | 2048 | 274.25 | 4.38 | 35.21 GB (0.15%) |
| Deepseek-coder | 33B | 1 | 64 | 64 | 83.08 | 4.07 | 22.16 GB (0.09%) |
| Deepseek-coder | 33B | 1 | 2048 | 2048 | 296.04 | 4.33 | 37.05 GB |

| Model | Version | Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory |
|-------|---------|------------|----------------|---------------|-------------------|------------------|---------------|
| Llama 2 7B | gemm | 1 | 32 | 32 | 817.86 | 70.93 | 1.94 GB (0.00%) |
| Llama 2 7B | gemm | 1 | 2048 | 2048 | 5279.15 | 36.83 | 2.31 GB (0.00%) |
| Falcon | gemm | 1 | 32 | 32 | 337.51 | 26.41 | 9.57 GB (0.01%) |
| Falcon | gemm | 1 | 2048 | 2048 | 546.71 | 18.8 | 13.46 GB (0.01%) |
| Mistral | gemm | 1 | 32 | 32 | 343.08 | 28.46 | 9.74 GB (0.01%) |
| Mistral | gemm | 1 | 2048 | 2048 | 1135.23 | 13.23 | 10.35 GB (0.01%) |
| Vicuna | gemm | 1 | 32 | 32 | 340.73 | 28.86 | 9.59 GB (0.01%) |
| Vicuna | gemm | 1 | 2048 | 2048 | 1143.19 | 11.14 | 10.98 GB (0.01%) |
| Llama 2 13B | gemm | 1 | 32 | 32 | 220.79 | 18.14 | 17.46 GB (0.02%) |
| Llama 2 13B | gemm | 1 | 2048 | 2048 | 650.94 | 6.54 | 19.84 GB (0.02%) |
| DeepSeek Coder 33B | gemm | 1 | 32 | 32 | 101.61 | 8.58 | 40.80 GB (0.04%) |
| DeepSeek Coder 33B | gemm | 1 | 2048 | 2048 | 245.02 | 3.48 | 41.72 GB (0.04%) |
| Phind CodeLlama 34B | gemm | 1 | 32 | 32 | 102.47 | 9.04 | 41.70 GB (0.04%) |
| Phind CodeLlama 34B | gemm | 1 | 2048 | 2048 | 237.57 | 3.48 | 42.47 GB (0.04%) |

## Reference

4 changes: 2 additions & 2 deletions awq/models/auto.py
@@ -88,7 +88,7 @@ def from_quantized(
fuse_layers=True,
use_exllama=False,
use_exllama_v2=False,
use_qbits=False,
use_ipex=False,
batch_size=1,
safetensors=True,
device_map="balanced",
@@ -116,7 +116,7 @@ def from_quantized(
fuse_layers=fuse_layers,
use_exllama=use_exllama,
use_exllama_v2=use_exllama_v2,
use_qbits=use_qbits,
use_ipex=use_ipex,
safetensors=safetensors,
device_map=device_map,
max_memory=max_memory,
51 changes: 21 additions & 30 deletions awq/models/base.py
@@ -17,23 +17,23 @@
from awq.modules.linear import (
WQLinear_GEMM,
WQLinear_GEMV,
WQLinear_QBits,
WQLinear_IPEX,
WQLinear_Marlin,
WQLinear_Exllama,
WQLinear_ExllamaV2,
WQLinear_GEMVFast,
marlin_post_init,
exllama_post_init,
exllamav2_post_init,
qbits_post_init,
ipex_post_init,
)
from awq.utils.module import (
get_named_linears,
set_op_by_name,
exclude_layers_to_not_quantize,
try_import,
)
from awq.utils.utils import get_best_device, qbits_available
from awq.utils.utils import get_best_device, ipex_available
from transformers import (
AutoConfig,
PreTrainedModel,
@@ -52,9 +52,6 @@
from awq.quantize.quantizer import AwqQuantizer
from awq.utils.module import get_named_linears, set_op_by_name

if qbits_available:
from intel_extension_for_transformers.qbits import check_isa_supported


# Since we support different `AutoModelForxxx` from transformers
# we need to define a custom mapping dict as below:
@@ -440,8 +437,8 @@ def from_quantized(
use_exllama_v2: Annotated[
bool, Doc("Whether to map the weights to ExLlamaV2 kernels.")
] = False,
use_qbits: Annotated[
bool, Doc("Whether to map the weights to qbits kernels for CPU device.")
use_ipex: Annotated[
bool, Doc("Whether to map the weights to ipex kernels for CPU device.")
] = False,
device_map: Annotated[
Union[str, Dict],
@@ -494,17 +491,11 @@
trust_remote_code=trust_remote_code,
)

use_cpu_qbits = use_qbits or get_best_device() == "cpu"
if use_cpu_qbits:
if not qbits_available:
raise ImportError(
"Please install intel-extension-for-transformers with "
"`pip install intel-extension-for-transformers` for 'qbits' kernel!"
)

fuse_layers = False
logging.warn(
"Unsupport fuse_layers featrue for CPU device with QBits backend!"
use_cpu_ipex = use_ipex or get_best_device() == "cpu"
if use_cpu_ipex and not ipex_available:
raise ImportError(
"Please install intel_extension_for_pytorch with "
"`pip install intel_extension_for_pytorch` for 'ipex' kernel!"
)
# Prepare WQLinear layers, replace nn.Linear
self._load_quantized_modules(
@@ -514,7 +505,7 @@
quant_config.version,
use_exllama=use_exllama,
use_exllama_v2=use_exllama_v2,
use_qbits=use_cpu_qbits,
use_ipex=use_cpu_ipex,
)

model.tie_weights()
@@ -539,11 +530,11 @@
else:
self.fuse_layers(model)

if use_cpu_qbits:
dtype = torch.bfloat16 if check_isa_supported("AMX") else torch.float32
if use_cpu_ipex:
dtype = torch.bfloat16
model.to(dtype=dtype, device="cpu")
# repack qweight to match the QBits kernel.
model = qbits_post_init(model)
# repack qweight to match the ipex kernel.
model = ipex_post_init(model)
elif quant_config.version == "marlin":
model = marlin_post_init(model)
elif use_exllama:
@@ -631,11 +622,11 @@ def _load_config(
return model_weights_path, config, quant_config

def _load_quantized_modules(
self, model, quant_config, version, use_exllama, use_exllama_v2, use_qbits=False
self, model, quant_config, version, use_exllama, use_exllama_v2, use_ipex=False
):
# Real quantization of weights
assert not (
version == "gemv" and (use_exllama or use_exllama_v2 or use_qbits)
version == "gemv" and (use_exllama or use_exllama_v2 or use_ipex)
), "Exllama kernels only support GEMM version."

# Get blocks of model
@@ -657,8 +648,8 @@ def _load_quantized_modules(

# Replace nn.Linear with WQLinear
for name, module in named_linears.items():
if use_qbits:
q_linear_module = WQLinear_QBits
if use_ipex:
q_linear_module = WQLinear_IPEX
elif version == "marlin":
q_linear_module = WQLinear_Marlin
elif use_exllama:
@@ -672,7 +663,7 @@ def _load_quantized_modules(
elif version == "gemv_fast":
q_linear_module = WQLinear_GEMVFast

if use_qbits:
if use_ipex:
q_linear = q_linear_module.from_linear(
module,
quant_config.w_bit,
@@ -687,7 +678,7 @@ def _load_quantized_modules(
q_linear.to(next(layer.parameters()).device)
set_op_by_name(layer, name, q_linear)

if not use_qbits:
if not use_ipex:
torch.cuda.empty_cache()
gc.collect()

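
Condensed for readers skimming the hunks above: after this change, `_load_quantized_modules` picks the IPEX linear whenever `use_ipex` is set (which `from_quantized` forces on CPU via `get_best_device()`), and otherwise falls back to the existing Marlin/ExLlama/GEMM/GEMV choices. The snippet below is a hypothetical restatement of that dispatch order for illustration, not code from the repository.

```python
# Hypothetical summary of the WQLinear dispatch implied by the diff above
# (the real logic lives in awq/models/base.py, _load_quantized_modules).
def pick_linear_module(version: str, use_ipex=False, use_exllama=False, use_exllama_v2=False) -> str:
    if use_ipex:                 # CPU path: IPEX weight-only-quantization kernel
        return "WQLinear_IPEX"
    if version == "marlin":
        return "WQLinear_Marlin"
    if use_exllama:
        return "WQLinear_Exllama"
    if use_exllama_v2:
        return "WQLinear_ExllamaV2"
    if version == "gemv":
        return "WQLinear_GEMV"
    if version == "gemm":
        return "WQLinear_GEMM"
    if version == "gemv_fast":
        return "WQLinear_GEMVFast"
    raise ValueError(f"unsupported AWQ version: {version}")
```
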
48 changes: 32 additions & 16 deletions awq/modules/fused/attn.py
@@ -176,6 +176,7 @@ def __init__(
self.is_neox = kwargs["is_neox"]

self.attn_logit_softcapping = attn_logit_softcapping
self.use_sdpa = kwargs.get("use_sdpa", False)

def forward(
self, hidden_states: torch.Tensor, attention_mask=None, *args, **kwargs
@@ -266,29 +267,44 @@ def forward(
xq = xq.transpose(1, 2)
keys = keys.transpose(1, 2)
values = values.transpose(1, 2)
scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)

# Used in Gemma2
if self.attn_logit_softcapping is not None:
scores = scores / self.attn_logit_softcapping
scores = torch.tanh(scores)
scores = scores * self.attn_logit_softcapping

if self.use_alibi:
scores = self.alibi.forward(scores, seqlen)

# When seqlen is 1, there is nothing else to attend to
if attention_mask is not None and seqlen > 1:
# For llama-arch, the causal mask is preallocated with bsz x 1 x max_seq_len x max_seq_len, thus we
# need to slice it
if attention_mask.shape[-1] != seqlen:
attention_mask = attention_mask[:, :, :seqlen, :seqlen]

scores = (
scores + attention_mask
) # (bs, n_local_heads, slen, cache_len + slen)
scores = F.softmax(scores.float(), dim=-1).type_as(xq)
output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim)
if self.use_sdpa:
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : keys.shape[-2]]
is_causal = True if causal_mask is None and seqlen > 1 else False
output = torch.nn.functional.scaled_dot_product_attention(
xq,
keys,
values,
attn_mask=causal_mask,
dropout_p=0.0,
is_causal=is_causal,
)
else:
scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
if self.use_alibi:
scores = self.alibi.forward(scores, seqlen)

# When seqlen is 1, there is nothing else to attend to
if attention_mask is not None and seqlen > 1:
# For llama-arch, the causal mask is preallocated with bsz x 1 x max_seq_len x max_seq_len, thus we
# need to slice it
if attention_mask.shape[-1] != seqlen:
attention_mask = attention_mask[:, :, :seqlen, :seqlen]

scores = (
scores + attention_mask
) # (bs, n_local_heads, slen, cache_len + slen)
scores = F.softmax(scores.float(), dim=-1).type_as(xq)
output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim)

attention_weight = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
else:
xq = xq.view((bsz,) + self.attention_shapes["single_xq_view"])
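
The fused attention module now takes a `use_sdpa` flag (enabled below for the fused transformer block) and, when set, replaces the explicit matmul/softmax/matmul sequence with `torch.nn.functional.scaled_dot_product_attention`. Both formulations compute the same attention output up to floating-point precision; the standalone check below (arbitrary shapes, no mask or ALiBi, independent of AutoAWQ) illustrates the equivalence.

```python
# Standalone check: SDPA matches the manual attention path (no mask, no ALiBi).
import math
import torch
import torch.nn.functional as F

bsz, n_heads, seqlen, head_dim = 1, 4, 8, 16
q = torch.randn(bsz, n_heads, seqlen, head_dim)
k = torch.randn(bsz, n_heads, seqlen, head_dim)
v = torch.randn(bsz, n_heads, seqlen, head_dim)

# Manual path, mirroring the non-SDPA branch: scaled scores -> softmax -> weighted sum.
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim)
manual = torch.matmul(F.softmax(scores, dim=-1), v)

# SDPA path, mirroring the new branch (is_causal=False because no mask is applied here).
sdpa = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)

print(torch.allclose(manual, sdpa, atol=1e-5))  # expected: True
```
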
1 change: 1 addition & 0 deletions awq/modules/fused/block.py
@@ -106,6 +106,7 @@ def __init__(
rope_theta=rope_theta,
partial_rotary_factor=partial_rotary_factor,
head_dim=head_dim,
use_sdpa=True,
).to(dev)
self.norm_2 = norm_2.to(dev)
self.mlp = mlp.to(dev)
23 changes: 16 additions & 7 deletions awq/modules/fused/norm.py
@@ -8,6 +8,13 @@
except:
AWQ_INSTALLED = False

try:
import intel_extension_for_pytorch as ipex # with IPEX kernels

IPEX_INSTALLED = True
except:
IPEX_INSTALLED = False


class FasterTransformerRMSNorm(nn.Module):
def __init__(self, weight, eps=1e-6):
@@ -16,12 +23,14 @@ def __init__(self, weight, eps=1e-6):
self.variance_epsilon = eps

def forward(self, x):
assert AWQ_INSTALLED, (
"AWQ kernels could not be loaded. "
"Please install them from https://github.com/casper-hansen/AutoAWQ_kernels"
)

output = torch.empty_like(x)
awq_ext.layernorm_forward_cuda(x, self.weight, output, self.variance_epsilon)
if IPEX_INSTALLED:
output = ipex.llm.functional.rms_norm(x, self.weight, self.variance_epsilon)
else:
assert AWQ_INSTALLED, (
"AWQ kernels could not be loaded. "
"Please install them from https://github.com/casper-hansen/AutoAWQ_kernels"
)
output = torch.empty_like(x)
awq_ext.layernorm_forward_cuda(x, self.weight, output, self.variance_epsilon)

return output
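
With this change the RMSNorm module prefers `ipex.llm.functional.rms_norm` when IPEX can be imported, and only falls back to the CUDA AWQ kernel otherwise. For reference, the operation both kernels implement is ordinary RMSNorm; the plain-PyTorch sketch below is an illustrative reference, not the kernel source.

```python
# Plain-PyTorch reference for what the IPEX / AWQ RMSNorm kernels compute.
import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root mean square over the last dimension, then apply the learned scale.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).type_as(x) * weight
```
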
2 changes: 1 addition & 1 deletion awq/modules/linear/__init__.py
@@ -1,7 +1,7 @@
from .exllama import WQLinear_Exllama, exllama_post_init
from .exllamav2 import WQLinear_ExllamaV2, exllamav2_post_init
from .gemm import WQLinear_GEMM
from .gemm_qbits import WQLinear_QBits, qbits_post_init
from .gemm_ipex import WQLinear_IPEX, ipex_post_init
from .gemv import WQLinear_GEMV
from .marlin import WQLinear_Marlin, marlin_post_init
from .gemv_fast import WQLinear_GEMVFast
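
Downstream code that imported the QBits linear directly needs the matching rename; a sketch of the before/after import, assuming no other API changes:

```python
# Before this PR (removed):
# from awq.modules.linear import WQLinear_QBits, qbits_post_init
# After this PR:
from awq.modules.linear import WQLinear_IPEX, ipex_post_init
```
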