LLM: support iq1s for llama2-70b-hf (#10596)
rnwang04 authored Apr 1, 2024
1 parent d6af487 commit bfc1caa
Showing 2 changed files with 20 additions and 11 deletions.
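In short, the commit threads the full Hugging Face model config (rather than a pre-extracted model_type string) down to get_cur_qtype_and_imatrix, which now checks hidden_size and num_hidden_layers to recognize llama2-70b and apply a custom mixed-quantization layout when gguf_iq1_s is requested. The net effect can be summarized with the following illustrative Python snippet, derived from the utils.py hunk below (the dict and its keys are editorial, not code from this commit):

    # Assumes a llama2-70b-shaped config: model_type == "llama", hidden_size == 8192,
    # num_hidden_layers == 80, with gguf_iq1_s as the requested qtype.
    LLAMA2_70B_IQ1_S_LAYOUT = {
        "v (all layers)":    "sym_int4",      # llama.cpp uses q4_k here
        "down (layers 0-9)": "q2_k",          # int(80 / 8) == 10, so layers 0..9
        "o (all layers)":    "gguf_iq2_xxs",  # bumped up from iq1_s
        "everything else":   "gguf_iq1_s",    # the requested qtype is kept
    }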
python/llm/src/ipex_llm/transformers/convert.py (15 changes: 6 additions & 9 deletions)
@@ -192,7 +192,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                  convert_shape_only=False,
                                  cpu_embedding=False, prefix_name='',
                                  imatrix_data=None, embedding_qtype=None,
-                                 model_type=None, torch_dtype=torch.float32,
+                                 model_config=None, torch_dtype=torch.float32,
                                  enable_xetla=False):
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
         FP16Linear, BF16Linear
@@ -211,6 +211,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             in_features, out_features, mp_group = linear_args
             optimize_lm_head = False
             if name == "lm_head":
+                model_type = getattr(model_config, "model_type", None)
                 if model_type in ["gptj", "llama"] and os.environ.get("BIGDL_OPTIMIZE_LM_HEAD",
                                                                       None) == "1":
                     optimize_lm_head = True
@@ -262,7 +263,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                 cur_qtype, cur_imatrix = get_cur_qtype_and_imatrix(qtype,
                                                                    full_module_name,
                                                                    imatrix_data,
-                                                                   model_type)
+                                                                   model_config)
                 device = module.weight.data.device
                 # Copy the weights
                 paramsLowBit = FP4Params(data=module.weight.data,
@@ -378,7 +379,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
             prefix_name=prefix_name + '.' + name if prefix_name != '' else name,
             imatrix_data=imatrix_data,
             embedding_qtype=embedding_qtype,
-            model_type=model_type,
+            model_config=model_config,
             torch_dtype=torch_dtype,
             enable_xetla=enable_xetla,
         )
@@ -652,17 +653,13 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
     if optimize_model:
         model = _optimize_pre(model)
 
-    # mixed quantization needs model_type to choose custom quantization strategy
-    if hasattr(model, "config"):
-        model_type = getattr(model.config, "model_type", None)
-    else:
-        model_type = None
+    # mixed quantization needs model_config to choose custom quantization strategy
     model, has_been_replaced = _replace_with_low_bit_linear(
         model, qtype, modules_to_not_convert,
         convert_shape_only, cpu_embedding,
         imatrix_data=imatrix_data,
         embedding_qtype=embedding_qtype,
-        model_type=model_type,
+        model_config=getattr(model, "config", None),
         torch_dtype=torch_dtype,
         enable_xetla=enable_xetla,
     )
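Taken together, the convert.py changes are plumbing: ggml_convert_low_bit now forwards the whole config object (or None) instead of a bare model_type string, and consumers pull what they need from it on demand. A minimal runnable sketch of that flow, using hypothetical stand-in names (toy_* functions and a SimpleNamespace config) rather than the real ipex_llm internals:

    import os
    from types import SimpleNamespace

    def toy_ggml_convert_low_bit(model):
        # Forward the full config object (or None) rather than a pre-extracted model_type.
        return toy_replace_with_low_bit_linear(model, model_config=getattr(model, "config", None))

    def toy_replace_with_low_bit_linear(model, model_config=None):
        for name in ["lm_head", "model.layers.0.self_attn.q_proj"]:
            if name == "lm_head":
                # model_type is derived lazily from the forwarded config, as in the hunk at line 211.
                model_type = getattr(model_config, "model_type", None)
                if model_type in ["gptj", "llama"] and \
                        os.environ.get("BIGDL_OPTIMIZE_LM_HEAD", None) == "1":
                    print("lm_head optimization enabled")
        return model

    # A config-only stand-in for a model is enough to exercise the new plumbing.
    toy_model = SimpleNamespace(config=SimpleNamespace(model_type="llama"))
    toy_ggml_convert_low_bit(toy_model)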
python/llm/src/ipex_llm/transformers/utils.py (16 changes: 14 additions & 2 deletions)
@@ -267,8 +267,12 @@ def module_name_process(full_module_name):
     return new_module_name, layer, cur_module
 
 
-def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=None):
+def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_config=None):
     cur_qtype = qtype
+    if model_config is not None:
+        model_type = getattr(model_config, "model_type", None)
+    else:
+        model_type = None
     if qtype in [ggml_tensor_qtype["gguf_iq2_xxs"], ggml_tensor_qtype["gguf_iq2_xs"],
                  ggml_tensor_qtype["gguf_iq1_s"]]:
         # For quantization which needs importance matrix
@@ -281,7 +285,15 @@ def get_cur_qtype_and_imatrix(qtype, full_module_name, imatrix_data, model_type=
             elif cur_module == 'down' and int(layer) in [0, 1, 2, 3]:
                 cur_qtype = ggml_tensor_qtype['q2_k']
         else:
-            if cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
+            num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
+            hidden_size = getattr(model_config, "hidden_size", None)
+            if model_type == "llama" and hidden_size == 8192:
+                # for llama2-70b
+                if cur_module == 'v':
+                    cur_qtype = ggml_tensor_qtype['sym_int4']  # llama.cpp use q4k here
+                if cur_module == 'down' and int(layer) < int(num_hidden_layers/8):
+                    cur_qtype = ggml_tensor_qtype['q2_k']
+            elif cur_module == 'v' or (cur_module == 'down' and int(layer) in [0, 1, 10, 11]):
                 cur_qtype = ggml_tensor_qtype['q2_k']
             if qtype == ggml_tensor_qtype["gguf_iq1_s"] and cur_module == 'o':
                 cur_qtype = ggml_tensor_qtype['gguf_iq2_xxs']
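To make the new llama2-70b branch concrete, here is a self-contained sketch that re-implements the selection rule above for the non-mixtral path (pick_qtype is a hypothetical helper and plain strings stand in for the ggml_tensor_qtype values; it is not the library function itself):

    from types import SimpleNamespace

    def pick_qtype(qtype, cur_module, layer, model_config):
        # Editorial re-implementation of the per-module rule added in utils.py.
        model_type = getattr(model_config, "model_type", None)
        num_hidden_layers = getattr(model_config, "num_hidden_layers", None)
        hidden_size = getattr(model_config, "hidden_size", None)
        cur_qtype = qtype
        if model_type == "llama" and hidden_size == 8192:
            # llama2-70b: value projections stay at sym_int4 (llama.cpp uses q4_k there)...
            if cur_module == "v":
                cur_qtype = "sym_int4"
            # ...and the first num_hidden_layers / 8 down projections use q2_k.
            if cur_module == "down" and int(layer) < int(num_hidden_layers / 8):
                cur_qtype = "q2_k"
        elif cur_module == "v" or (cur_module == "down" and int(layer) in [0, 1, 10, 11]):
            cur_qtype = "q2_k"
        # Under iq1_s, output projections on this path are bumped up to iq2_xxs.
        if qtype == "gguf_iq1_s" and cur_module == "o":
            cur_qtype = "gguf_iq2_xxs"
        return cur_qtype

    llama2_70b = SimpleNamespace(model_type="llama", hidden_size=8192, num_hidden_layers=80)
    for module, layer in [("v", 5), ("down", 9), ("down", 10), ("o", 40), ("q", 40)]:
        print(module, layer, "->", pick_qtype("gguf_iq1_s", module, layer, llama2_70b))
    # v 5 -> sym_int4, down 9 -> q2_k, down 10 -> gguf_iq1_s,
    # o 40 -> gguf_iq2_xxs, q 40 -> gguf_iq1_s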
