From a5b63eb5cdf8dd821c45e7184fcc49be8da555e0 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Fri, 22 Nov 2024 12:18:19 +0800 Subject: [PATCH 1/6] initial commit --- .../LLM/CPP_Examples/convert.py | 2 +- .../src/ipex_llm/transformers/npu_model.py | 8 +- .../npu_pipeline_model/convert_pipeline.py | 64 ++++++++- .../transformers/npu_pipeline_model/qwen.py | 124 +++++++++++++++++- 4 files changed, 179 insertions(+), 19 deletions(-) diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py index 27bb3335967..c88aff51bdc 100644 --- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py +++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/convert.py @@ -63,7 +63,7 @@ transpose_value_cache=not args.disable_transpose_value_cache, mixed_precision=True, trust_remote_code=True, - compile_full_model=True, + convert_model=True, save_directory=save_dir) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 2c96b06e1fe..e57cd0ba760 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -134,7 +134,7 @@ def from_pretrained(cls, *args, **kwargs): mixed_precision = kwargs.pop('mixed_precision', False) quantization_group_size = kwargs.pop("quantization_group_size", 0) mock_device = kwargs.pop('device', None) # For mock on CPU - compile_full_model = kwargs.pop('compile_full_model', False) + convert_model = kwargs.pop('convert_model', False) save_directory = kwargs.pop('save_directory', None) invalidInputError( @@ -202,7 +202,7 @@ def from_pretrained(cls, *args, **kwargs): "inter_pp": inter_pp, "intra_pp": intra_pp, "transpose_value_cache": transpose_value_cache, - "compile_full_model": compile_full_model, + "convert_model": convert_model, "save_directory": save_directory, } model = cls.optimize_npu_model(*args, **optimize_kwargs) @@ -241,7 +241,7 @@ def optimize_npu_model(cls, *args, **kwargs): inter_pp = kwargs.pop("inter_pp", None) intra_pp = kwargs.pop("intra_pp", None) transpose_value_cache = kwargs.pop("transpose_value_cache", True) - compile_full_model = kwargs.pop('compile_full_model', False) + convert_model = kwargs.pop('convert_model', False) save_directory = kwargs.pop('save_directory', None) if hasattr(model, "llm"): @@ -280,7 +280,7 @@ def optimize_npu_model(cls, *args, **kwargs): max_prompt_len=max_prompt_len, transpose_value_cache=transpose_value_cache, group_size=quantization_group_size, - compile_full_model=compile_full_model, + convert_model=convert_model, save_directory=save_directory) model.save_low_bit = types.MethodType(save_low_bit, model) return model diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 343d79ee6be..1b20169e774 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -193,7 +193,7 @@ def convert_llm(model: torch.nn.Module, max_prompt_len: int, transpose_value_cache: bool, group_size: int, - compile_full_model: bool=False, + convert_model: bool=False, save_directory: str=None): # whether to set layernorm weight as const layernorm_const = 
os.environ.get("IPEX_LLM_LAYERNORM_CONST", "1") == "1" @@ -203,6 +203,16 @@ def convert_llm(model: torch.nn.Module, else: n_splits_linear = model.config.hidden_size // group_size n_splits_down_proj = model.config.intermediate_size // group_size + if convert_model == True: + convert_llm_for_deploy(model, + kv_len, + max_prompt_len, + transpose_value_cache, + n_splits_linear, + n_splits_down_proj, + group_size, + save_directory) + return 0 if model.config.model_type == "llama": with tempfile.TemporaryDirectory() as temp_dir: weight_dir = os.path.join(temp_dir, "model_weights") @@ -340,7 +350,7 @@ def convert_llm(model: torch.nn.Module, from .qwen import convert_qwen_layer, convert_lm_head_and_embedding first_blob_path, last_blob_path = convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir, - compile_full_model) + convert_model) param_list = [] for layer_idx in range(0, layer_num): @@ -350,11 +360,6 @@ def convert_llm(model: torch.nn.Module, with Pool() as pool: result = pool.starmap(convert_qwen_layer, param_list) - if compile_full_model: - convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj, - temp_dir, weight_dir, transpose_value_cache, max_prompt_len, - group_size, layernorm_const, "prefill") - # Prefill Runner from ipex_llm.transformers.npu_models.convert_mp import convert_qwen convert_qwen(model, @@ -403,3 +408,48 @@ def convert_llm(model: torch.nn.Module, import types model.generate = types.MethodType(generate, model) return model + + +def convert_llm_for_deploy(model: torch.nn.Module, + kv_len: int, + max_prompt_len: int, + transpose_value_cache: bool, + n_splits_linear: int, + n_splits_down_proj: int, + group_size: int, + save_directory: str=None): + os.mkdir(save_directory) + weight_dir = os.path.join(save_directory, "model_weights") + os.mkdir(weight_dir) + + if model.config.model_type == "qwen2": + layernorm_const =True + if model.config.hidden_size == 1536: + # Qwen2-1.5B-Instruct + fused_layers = 1 + else: + fused_layers = 2 + update_dict = {"kv_len": kv_len, + "num_head": model.model.layers[0].self_attn.num_heads, + "head_dim": model.model.layers[0].self_attn.head_dim, + "transpose_value_cache": transpose_value_cache, + "max_prompt_len": max_prompt_len, + "layernorm_const": layernorm_const, + "group_size": group_size, + "fused_layers": fused_layers} + model.config.update(update_dict) + model.config.save_pretrained(save_directory) + + from .qwen import convert_qwen_layer, convert_fused_qwen_layer + from .qwen import convert_lm_head_and_embedding + # save fused decoder layers's blob + convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj, + save_directory, weight_dir, transpose_value_cache, max_prompt_len, + group_size, layernorm_const, "decode") + # save prefill IR + convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj, + save_directory, weight_dir, transpose_value_cache, max_prompt_len, + group_size, layernorm_const, "prefill") + + convert_lm_head_and_embedding(model, n_splits_linear, + save_directory, weight_dir, True) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 8d3966b4ead..59a42f13093 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -23,7 +23,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir, - compile_full_model=False): + convert_model=False): num_heads = 
model.model.layers[0].self_attn.num_heads head_dim = model.model.layers[0].self_attn.head_dim rms_norm_eps = model.config.rms_norm_eps @@ -60,7 +60,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir, ) last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head", - temp_dir, True, True) + temp_dir, True, False) # save weights bins files if not isinstance(lm_head, SlicedLMHead): @@ -83,11 +83,13 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir, dtype=np.float16, input_length=1, ) - first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding", - temp_dir, True, keep_ir=True) - if compile_full_model: + if convert_model: bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin") embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file) + first_blob_path = True + else: + first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding", + temp_dir, True, keep_ir=True) return first_blob_path, last_blob_path @@ -138,8 +140,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, else: input_len = kv_len decoder_name = "decoder_layer_prefill" - compile = False - keep_ir = True + compile = True + keep_ir = False single_decoder = LowBitQwenMultiDecoderlayer( [1, input_len, num_heads * head_dim], input_layernorm_weights=None, @@ -190,3 +192,111 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, scale.numpy().tofile(bin_file) del single_decoder + + +def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj, + save_dir, weight_dir, transpose_value_cache, kv_len, group_size, + layernorm_const, mode="decode"): + num_heads = model.model.layers[0].self_attn.num_heads + num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads + head_dim = model.model.layers[0].self_attn.head_dim + intermediate_size = model.config.intermediate_size + rms_norm_eps = model.config.rms_norm_eps + layer_num = len(model.model.layers) + fused_layer_num = layer_num // fused_layers + + from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer + for i in range(fused_layers): + layer_start = i * fused_layer_num + layer_end = min((i + 1) * fused_layer_num, layer_num) + layer_weights = [] + input_layer_norm_weights = [] + post_attn_layernorm_weights = [] + q_biases = [] + k_biases = [] + v_biases = [] + layer_indexs = range(layer_start, layer_end) + n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) + n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + for layer_idx in layer_indexs: + curr_layer = model.model.layers[layer_idx] + attn_layer = curr_layer.self_attn + mlp_layer = curr_layer.mlp + + weights = [] + for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list, + attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list, + mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list, + mlp_layer.down_proj_dq_list]: + l_weights = [] + scales = [] + for l in layer_list: + l_weights.append(l.weight) + scales.append(l.scale) + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + + cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) + cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) + layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16) + layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16) + + layer_weights.extend(weights) + 
input_layer_norm_weights.append(layer_norm_0) + post_attn_layernorm_weights.append(layer_norm_1) + q_biases.append(attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)) + k_biases.append(attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)) + v_biases.append(attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)) + + # save weight + input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin") + post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin") + layer_norm_0.data.numpy().tofile(input_lm_bin_file) + layer_norm_1.data.numpy().tofile(post_lm_bin_file) + st_idx = 5 + # 5 / 6 / 7 are bias + q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin") + k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin") + v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin") + q_biases[-1].data.numpy().tofile(q_bias_bin_file) + k_biases[-1].data.numpy().tofile(k_bias_bin_file) + v_biases[-1].data.numpy().tofile(v_bias_bin_file) + # 6, 7 are past k/v + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + + if isinstance(weights[0], tuple): + np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 + else: # FP16 Linear + np_dtype = np.float16 + + fused_decoder = LowBitQwenMultiDecoderlayer( + [1, 1, num_heads * head_dim], + input_layernorm_weights=input_layer_norm_weights, + post_attn_layernorm_weights=post_attn_layernorm_weights, + q_biases=q_biases, + k_biases=k_biases, + v_biases=v_biases, + cached_cos=cached_cos, + cached_sin=cached_sin, + num_heads=num_heads, + num_key_value_heads=num_key_value_heads, + num_layers=fused_layer_num, + max_seq_len=kv_len, + rms_norm_eps=rms_norm_eps, + intermediate_size=intermediate_size, + mode=mode, + transpose_value=transpose_value_cache, + dtype=np_dtype, + n_splits_linear=n_splits_linear, + n_splits_down_proj=n_splits_down_proj, + group_size=group_size + ) + update_names_of_IR_and_export_blob(fused_decoder, + f"decoder_layer_{i}", + save_dir, + compile_blob=True, + keep_ir=False) + return 0 From 78a377b3b6e9e444758d5184849452a757c1bee3 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Fri, 22 Nov 2024 12:27:48 +0800 Subject: [PATCH 2/6] fix --- .../transformers/npu_pipeline_model/convert_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 1b20169e774..eef60ff4833 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -444,7 +444,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, from .qwen import convert_lm_head_and_embedding # save fused decoder layers's blob convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj, - save_directory, weight_dir, transpose_value_cache, max_prompt_len, + save_directory, weight_dir, transpose_value_cache, kv_len, group_size, layernorm_const, "decode") # save prefill IR convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj, From 00b3b35a5c8945d394b425fa224ec485ced416ca Mon Sep 17 00:00:00 2001 From: 
rnwang04 Date: Fri, 22 Nov 2024 12:30:15 +0800 Subject: [PATCH 3/6] fix style --- .../src/ipex_llm/transformers/npu_models/qwen2_mp.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 015efe10031..b2cc14c44d6 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -632,6 +632,16 @@ def run_decode( n_splits_down_proj=n_splits_down_proj, group_size=group_size ) + dir = "D:\\ruonan\\converted-qwen2-1.5B" + if intra_stages > 1: + for i in range(intra_stages): + xml_path = f"Qwen2-1.5B-{len(layer_indexs)}-{intra_stages * (my_rank-1) + i}.xml" + print(os.path.join(dir, xml_path)) + multi_decoder.backend_decoders[i].save(os.path.join(dir, xml_path)) + else: + xml_path = f"Qwen2-1.5B-{len(layer_indexs)}-{(my_rank-1)}.xml" + print(os.path.join(dir, xml_path)) + multi_decoder.backend_decoders[0].save(os.path.join(dir, xml_path)) dist.barrier() From 453bc75d5c5067567c4dd9dd13aa247140a736dc Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Fri, 22 Nov 2024 12:32:19 +0800 Subject: [PATCH 4/6] fix style --- .../transformers/npu_pipeline_model/convert_pipeline.py | 6 +++--- .../src/ipex_llm/transformers/npu_pipeline_model/qwen.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index eef60ff4833..d5064a8f852 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -203,7 +203,7 @@ def convert_llm(model: torch.nn.Module, else: n_splits_linear = model.config.hidden_size // group_size n_splits_down_proj = model.config.intermediate_size // group_size - if convert_model == True: + if convert_model: convert_llm_for_deploy(model, kv_len, max_prompt_len, @@ -423,9 +423,9 @@ def convert_llm_for_deploy(model: torch.nn.Module, os.mkdir(weight_dir) if model.config.model_type == "qwen2": - layernorm_const =True + layernorm_const = True if model.config.hidden_size == 1536: - # Qwen2-1.5B-Instruct + # Qwen2-1.5B-Instruct fused_layers = 1 else: fused_layers = 2 diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index 59a42f13093..61233731404 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -207,7 +207,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer for i in range(fused_layers): - layer_start = i * fused_layer_num + layer_start = i * fused_layer_num layer_end = min((i + 1) * fused_layer_num, layer_num) layer_weights = [] input_layer_norm_weights = [] @@ -264,7 +264,8 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down for idx, (weight, scale) in enumerate(weights): bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") 
scale.numpy().tofile(bin_file) if isinstance(weights[0], tuple): From 60c8851d36a377461b718828ed37817c924e01f9 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Fri, 22 Nov 2024 12:36:06 +0800 Subject: [PATCH 5/6] fix --- .../src/ipex_llm/transformers/npu_models/qwen2_mp.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index b2cc14c44d6..015efe10031 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -632,16 +632,6 @@ def run_decode( n_splits_down_proj=n_splits_down_proj, group_size=group_size ) - dir = "D:\\ruonan\\converted-qwen2-1.5B" - if intra_stages > 1: - for i in range(intra_stages): - xml_path = f"Qwen2-1.5B-{len(layer_indexs)}-{intra_stages * (my_rank-1) + i}.xml" - print(os.path.join(dir, xml_path)) - multi_decoder.backend_decoders[i].save(os.path.join(dir, xml_path)) - else: - xml_path = f"Qwen2-1.5B-{len(layer_indexs)}-{(my_rank-1)}.xml" - print(os.path.join(dir, xml_path)) - multi_decoder.backend_decoders[0].save(os.path.join(dir, xml_path)) dist.barrier() From 1a6f9b7f16732e0548528dd2472436208be800e8 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Fri, 22 Nov 2024 14:20:09 +0800 Subject: [PATCH 6/6] fix --- .../transformers/npu_pipeline_model/convert_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index d5064a8f852..2ab7d72b929 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -442,14 +442,14 @@ def convert_llm_for_deploy(model: torch.nn.Module, from .qwen import convert_qwen_layer, convert_fused_qwen_layer from .qwen import convert_lm_head_and_embedding - # save fused decoder layers's blob + # save fused_layers blobs of fused decoder layers convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down_proj, save_directory, weight_dir, transpose_value_cache, kv_len, group_size, layernorm_const, "decode") - # save prefill IR + # save blob of single prefill layer convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj, save_directory, weight_dir, transpose_value_cache, max_prompt_len, group_size, layernorm_const, "prefill") - + # save blob of lmhead and bin of embedding convert_lm_head_and_embedding(model, n_splits_linear, save_directory, weight_dir, True)
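
A minimal usage sketch of the convert_model flow introduced in this series, following the updated CPP_Examples/convert.py: with convert_model=True and a save_directory, from_pretrained routes through convert_llm into convert_llm_for_deploy, which writes the fused decoder blob(s), the single prefill-layer blob, the lm_head blob, the embedding weight bin, and an updated config.json into save_directory. The model id, save directory, and low-bit settings below are placeholder assumptions, not values taken from the patches.

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_path = "Qwen/Qwen2-1.5B-Instruct"   # placeholder model id
    save_dir = "./qwen2-npu-converted"        # placeholder; convert_llm_for_deploy calls os.mkdir,
                                              # so this directory must not exist yet

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        load_in_low_bit="sym_int4",           # assumed low-bit setting
        optimize_model=True,                  # assumed
        max_prompt_len=512,                   # assumed
        transpose_value_cache=True,
        mixed_precision=True,
        trust_remote_code=True,
        convert_model=True,                   # new flag from this series (renamed from compile_full_model)
        save_directory=save_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)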