LLM: support iq1s for llama2-70b-hf (#10596)
rnwang04 authored Apr 1, 2024
1 parent d6af487 commit bfc1caa
Showing 2 changed files with 20 additions and 11 deletions.
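In short, the commit threads the full Hugging Face model config (rather than a pre-extracted model_type string) down to get_cur_qtype_and_imatrix, which now checks hidden_size and num_hidden_layers to recognize llama2-70b and apply a custom mixed-quantization layout when gguf_iq1_s is requested. The net effect can be summarized with the following illustrative Python snippet, derived from the utils.py hunk below (the dict and its keys are editorial, not code from this commit):

    # Assumes a llama2-70b-shaped config: model_type == "llama", hidden_size == 8192,
    # num_hidden_layers == 80, with gguf_iq1_s as the requested qtype.
    LLAMA2_70B_IQ1_S_LAYOUT = {
        "v (all layers)":    "sym_int4",      # llama.cpp uses q4_k here
        "down (layers 0-9)": "q2_k",          # int(80 / 8) == 10, so layers 0..9
        "o (all layers)":    "gguf_iq2_xxs",  # bumped up from iq1_s
        "everything else":   "gguf_iq1_s",    # the requested qtype is kept
    }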
python/llm/src/ipex_llm/transformers/convert.py (15 changes: 6 additions & 9 deletions)
@@ -192,7 +192,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                  convert_shape_only=False,
                                  cpu_embedding=False, prefix_name='',
                                  imatrix_data=None, embedding_qtype=None,
-                                 model_type=None, torch_dtype=torch.float32,
+                                 model_config=None, torch_dtype=torch.float32,
                                  enable_xetla=False):
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
         FP16Linear, BF16Linear
@@ -211,6 +211,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             in_features, out_features, mp_group = linear_args
             optimize_lm_head = False
             if name == "lm_head":
+                model_type = getattr(model_config, "model_type", None)
                 if model_type in ["gptj", "llama"] and os.environ.get("BIGDL_OPTIMIZE_LM_HEAD",
                                                                       None) == "1":
                     optimize_lm_head = True
@@ -262,7 +263,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 cur_qtype, cur_imatrix = get_cur_qtype_and_imatrix(qtype,
                                                                    full_module_name,
                                                                    imatrix_data,
-                                                                   model_type)
+                                                                   model_config)
                 device = module.weight.data.device
                 # Copy the weights
                 paramsLowBit = FP4Params(data=module.weight.data,
@@ -378,7 +379,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             prefix_name=prefix_name + '.' + name if prefix_name != '' else name,
             imatrix_data=imatrix_data,
             embedding_qtype=embedding_qtype,
-            model_type=model_type,
+            model_config=model_config,
             torch_dtype=torch_dtype,
             enable_xetla=enable_xetla,
         )
@@ -652,17 +653,13 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
     if optimize_model:
         model = _optimize_pre(model)
 
-    # mixed quantization needs model_type to choose custom quantization strategy
-    if hasattr(model, "config"):
-        model_type = getattr(model.config, "model_type", None)
-    else:
-        model_type = None
+    # mixed quantization needs model_config to choose custom quantization strategy
     model, has_been_replaced = _replace_with_low_bit_linear(
         model, qtype, modules_to_not_convert,
         convert_shape_only, cpu_embedding,
         imatrix_data=imatrix_data,
         embedding_qtype=embedding_qtype,
-        model_type=model_type,
+        model_config=getattr(model, "config", None),
         torch_dtype=torch_dtype,
         enable_xetla=enable_xetla,
     )
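Taken together, the convert.py changes are plumbing: ggml_convert_low_bit now forwards the whole config object (or None) instead of a bare model_type string, and consumers pull what they need from it on demand. A minimal runnable sketch of that flow, using hypothetical stand-in names (toy_* functions and a SimpleNamespace config) rather than the real ipex_llm internals:

    import os
    from types import SimpleNamespace

    def toy_ggml_convert_low_bit(model):
        # Forward the full config object (or None) rather than a pre-extracted model_type.
        return toy_replace_with_low_bit_linear(model, model_config=getattr(model, "config", None))

    def toy_replace_with_low_bit_linear(model, model_config=None):
        for name in ["lm_head", "model.layers.0.self_attn.q_proj"]:
            if name == "lm_head":
                # model_type is derived lazily from the forwarded config, as in the hunk at line 211.
                model_type = getattr(model_config, "model_type", None)
                if model_type in ["gptj", "llama"] and \
                        os.environ.get("BIGDL_OPTIMIZE_LM_HEAD", None) == "1":
                    print("lm_head optimization enabled")
        return model

    # A config-only stand-in for a model is enough to exercise the new plumbing.
    toy_model = SimpleNamespace(config=SimpleNamespace(model_type="llama"))
    toy_ggml_convert_low_bit(toy_model)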
python/llm/src/ipex_llm/transformers/utils.py (16 changes: 14 additions & 2 deletions)
@@ -267,8 +267,12 @@ def module_name_process(full_module_name):
     return new_module_name, layer, cur_module
 
 
-def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=None):
+def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_config=None):
     cur_qtype = qtype
+    if model_config is not None:
+        model_type = getattr(model_config, "model_type", None)
+    else:
+        model_type = None
     if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"],
                  ggml_tensor_qtype["gguf_iq1_s"]]:
         # For quantization which needs importance matrix
@@ -281,7 +285,15 @@ def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=
             elif cur_module == 'down' and int(layer) in [0, 1, 2, 3]:
                 cur_qtype = ggml_tensor_qtype['q2_k']
         else:
-            if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
+            num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
+            hidden_size = getattr(model_config, "hidden_size", None)
+            if model_type == "llama" and hidden_size == 8192:
+                # for llama2-70b
+                if cur_module == 'v':
+                    cur_qtype = ggml_tensor_qtype['sym_int4']  # llama.cpp use q4k here
+                if cur_module == 'down' and int(layer) < int(num_hidden_layers/8):
+                    cur_qtype = ggml_tensor_qtype['q2_k']
+            elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
                 cur_qtype = ggml_tensor_qtype['q2_k']
             if qtype == ggml_tensor_qtype["gguf_iq1_s"] and cur_module == 'o':
                 cur_qtype = ggml_tensor_qtype['gguf_iq2_xxs']
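To make the new llama2-70b branch concrete, here is a self-contained sketch that re-implements the selection rule above for the non-mixtral path (pick_qtype is a hypothetical helper and plain strings stand in for the ggml_tensor_qtype values; it is not the library function itself):

    from types import SimpleNamespace

    def pick_qtype(qtype, cur_module, layer, model_config):
        # Editorial re-implementation of the per-module rule added in utils.py.
        model_type = getattr(model_config, "model_type", None)
        num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
        hidden_size = getattr(model_config, "hidden_size", None)
        cur_qtype = qtype
        if model_type == "llama" and hidden_size == 8192:
            # llama2-70b: value projections stay at sym_int4 (llama.cpp uses q4_k there)...
            if cur_module == "v":
                cur_qtype = "sym_int4"
            # ...and the first num_hidden_layers / 8 down projections use q2_k.
            if cur_module == "down" and int(layer) < int(num_hidden_layers / 8):
                cur_qtype = "q2_k"
        elif cur_module == "v" or (cur_module == "down" and int(layer) in [0, 1, 10, 11]):
            cur_qtype = "q2_k"
        # Under iq1_s, output projections on this path are bumped up to iq2_xxs.
        if qtype == "gguf_iq1_s" and cur_module == "o":
            cur_qtype = "gguf_iq2_xxs"
        return cur_qtype

    llama2_70b = SimpleNamespace(model_type="llama", hidden_size=8192, num_hidden_layers=80)
    for module, layer in [("v", 5), ("down", 9), ("down", 10), ("o", 40), ("q", 40)]:
        print(module, layer, "->", pick_qtype("gguf_iq1_s", module, layer, llama2_70b))
    # v 5 -> sym_int4, down 9 -> q2_k, down 10 -> gguf_iq1_s,
    # o 40 -> gguf_iq2_xxs, q 40 -> gguf_iq1_s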
