diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 9c55a1d5382..ac86aa269cc 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -239,6 +239,8 @@ def optimize_npu_model(cls, *args, **kwargs):
         inter_pp = kwargs.pop("inter_pp", None)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+        compile_full_model = kwargs.pop("compile_full_model", False)
+        save_directory = kwargs.pop("save_directory", None)
 
         if hasattr(model, "llm"):
             llm = model.llm
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index 4834e1de45b..8806e64d1c9 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -56,7 +56,7 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
         f.write(model_stream)
 
     os.remove(xml_path)
-    os.remove(bin_path)
+    # os.remove(bin_path)
    if not keep_ir:
         os.remove(new_ir_path)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 31be6e2a1e5..bc1acf3f21c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -350,9 +350,9 @@ def convert_llm(model: torch.nn.Module,
                 result = pool.starmap(convert_qwen_layer, param_list)
 
             if compile_full_model:
-                from .qwen import convert_qwen_prefill_layer, convert_lm_head_and_embedding
-                convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                                           temp_dir, weight_dir, transpose_value_cache, max_prompt_len, group_size)
+                convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
+                                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len,
+                                   group_size, layernorm_const, "prefill")
                 convert_lm_head_and_embedding(model, n_splits_linear,
                                               temp_dir, weight_dir, max_prompt_len)
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index 92e574fb23a..e82a0ac18e6 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -85,18 +85,19 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
     )
     suffix = "_prefill" if input_length > 1 else ""
     compile = False if input_length > 1 else True
-    if input_length == 0:
+    if input_length == 1:
         first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding{suffix}",
                                                              temp_dir, compile, keep_ir=False)
-    if input_length > 1:
+    else:
         bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
         embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
+        first_blob_path = None
     return first_blob_path, last_blob_path
 
 
 def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                        temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const):
+                       layernorm_const, mode="decode"):
     num_heads = model.model.layers[0].self_attn.num_heads
     num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
     head_dim = model.model.layers[0].self_attn.head_dim
@@ -133,162 +134,85 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:  # FP16 Linear
         np_dtype = np.float16
 
-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, 1, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-        post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="decode",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        f"decoder_layer_{layer_idx}",
-                                                        temp_dir)
-    bin_path = os.path.join(temp_dir, f"decoder_layer_{layer_idx}" + ".bin")
-    os.remove(bin_path)
-
-    # 0, 1, 2 are input_embed/attention_mask/position_id
-    if layernorm_const:
-        st_idx = 3
-    else:
-        input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-        post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-        layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-        layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-        st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
-    # 6, 7 are past k/v
-    for idx, (weight, scale) in enumerate(weights):
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
-        weight.numpy().tofile(bin_file)
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
-        scale.numpy().tofile(bin_file)
-
-    del single_decoder
-
-
-def convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                               temp_dir, weight_dir, transpose_value_cache, kv_len, group_size):
-    layer_idx = 0
-    num_heads = model.model.layers[0].self_attn.num_heads
-    num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
-    head_dim = model.model.layers[0].self_attn.head_dim
-    intermediate_size = model.config.intermediate_size
-    rms_norm_eps = model.config.rms_norm_eps
-
-    from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer
-    curr_layer = model.model.layers[layer_idx]
-    attn_layer = curr_layer.self_attn
-    mlp_layer = curr_layer.mlp
-
-    weights = []
-    if n_splits_linear == 1:
-        for q, k, v, o, g, u in zip(attn_layer.q_proj_dq_list,
-                                    attn_layer.k_proj_dq_list,
-                                    attn_layer.v_proj_dq_list,
-                                    attn_layer.o_proj_dq_list,
-                                    mlp_layer.gate_proj_dq_list,
-                                    mlp_layer.up_proj_dq_list):
-            weights.append((q.weight, q.scale))
-            weights.append((k.weight, k.scale))
-            weights.append((v.weight, v.scale))
-            weights.append((o.weight, o.scale))
-            weights.append((g.weight, g.scale))
-            weights.append((u.weight, u.scale))
+    if mode == "decode":
+        single_decoder = LowBitQwenMultiDecoderlayer(
+            [1, 1, num_heads * head_dim],
+            input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
+            post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
+            q_biases=None,
+            k_biases=None,
+            v_biases=None,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=1,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode="decode",
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
     else:
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list]:
-            l_weights = []
-            scales = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    if n_splits_down_proj == 1:
-        for l in mlp_layer.down_proj_dq_list:
-            weights.append((l.weight, l.scale))
+        single_decoder = LowBitQwenMultiDecoderlayer(
+            [1, kv_len, num_heads * head_dim],
+            input_layernorm_weights=None,
+            post_attn_layernorm_weights=None,
+            q_biases=None,
+            k_biases=None,
+            v_biases=None,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=1,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode="prefill",
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
+    if mode == "decode":
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            f"decoder_layer_{layer_idx}",
+                                                            temp_dir)
     else:
-        l_weights = []
-        scales = []
-        for l in mlp_layer.down_proj_dq_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-        weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
-    k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
-    v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
-    layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
-    layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
-
-    if isinstance(weights[0], tuple):
-        np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8
-    else:  # FP16 Linear
-        np_dtype = np.float16
-
-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, kv_len, num_heads * head_dim],
-        input_layernorm_weights=None,
-        post_attn_layernorm_weights=None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="prefill",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        "decoder_layer_prefill",
-                                                        temp_dir,
-                                                        False)
-    bin_path = os.path.join(temp_dir, "decoder_layer_prefill" + ".bin")
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            "decoder_layer_prefill",
+                                                            temp_dir)
+    blob_name = f"decoder_layer_{layer_idx}" if mode == "decode" else "decoder_layer_prefill"
+    bin_path = os.path.join(temp_dir, blob_name + ".bin")
     os.remove(bin_path)
 
     # 0, 1, 2 are input_embed/attention_mask/position_id
-    input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-    post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-    layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-    layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-    st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
+    if mode == "decode":
+        if layernorm_const:
+            st_idx = 3
+        else:
+            input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
+            post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
+            layer_norm_0.data.numpy().tofile(input_lm_bin_file)
+            layer_norm_1.data.numpy().tofile(post_lm_bin_file)
+            st_idx = 5
+        q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
+        k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
+        v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
+        q_bias.data.numpy().tofile(q_bias_bin_file)
+        k_bias.data.numpy().tofile(k_bias_bin_file)
+        v_bias.data.numpy().tofile(v_bias_bin_file)
+        # 6, 7 are past k/v
+        for idx, (weight, scale) in enumerate(weights):
+            bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
+            weight.numpy().tofile(bin_file)
+            bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
+            scale.numpy().tofile(bin_file)
 
     del single_decoder
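
---

Usage sketch (not part of the patch): the two new kwargs are popped in
`optimize_npu_model` and drive the extra prefill conversion in `convert_llm`
above. A minimal example of how a caller might pass them follows; the model
id, paths, and the other kwargs shown are placeholders based on the usual
ipex-llm NPU pipeline arguments, not something this patch introduces:

    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    # "compile_full_model" additionally converts the prefill decoder layer and
    # the embedding/LM head (see convert_llm and convert_qwen_layer above);
    # "save_directory" is where the converted artifacts are meant to land.
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-1.5B-Instruct",            # placeholder model id
        optimize_model=True,
        pipeline=True,
        load_in_low_bit="sym_int4",
        max_prompt_len=512,                    # used as kv_len of the prefill blob
        transpose_value_cache=True,
        compile_full_model=True,               # new kwarg in this patch
        save_directory="./qwen2-npu-compiled"  # new kwarg in this patch
    )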