diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 9744e2f85f1..983c6393496 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs):
         model.share_memory()
 
         if not pipeline:
-            if (not hasattr(model, 'llm') and
-                    model.config.model_type in ["qwen2", "llama", "minicpm"]):
-                from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
-                optimize_llm_single_process(
-                    llm,
-                    kv_len=max_context_len,
-                    max_prompt_len=max_prompt_len,
-                    transpose_value_cache=transpose_value_cache,
-                    group_size=quantization_group_size,
-                    qtype=qtype,
-                    save_directory=save_directory,
-                    fuse_layers=fuse_layers
-                )
-            else:
-                optimize_llm(
-                    llm,
-                    max_context_len=max_context_len,
-                    max_prompt_len=max_prompt_len,
-                    inter_pp=inter_pp,
-                    intra_pp=intra_pp,
-                    transpose_value_cache=transpose_value_cache,
-                    group_size=quantization_group_size
-                )
+            # if (not hasattr(model, 'llm') and
+            #         model.config.model_type in ["qwen2", "llama", "minicpm"]):
+            #     from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+            #     optimize_llm_single_process(
+            #         llm,
+            #         kv_len=max_context_len,
+            #         max_prompt_len=max_prompt_len,
+            #         transpose_value_cache=transpose_value_cache,
+            #         group_size=quantization_group_size,
+            #         qtype=qtype,
+            #         save_directory=save_directory,
+            #         fuse_layers=fuse_layers
+            #     )
+            # else:
+            optimize_llm(
+                llm,
+                max_context_len=max_context_len,
+                max_prompt_len=max_prompt_len,
+                inter_pp=inter_pp,
+                intra_pp=intra_pp,
+                transpose_value_cache=transpose_value_cache,
+                group_size=quantization_group_size
+            )
         else:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 2842799b160..c5dc8c56ffd 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -485,6 +485,7 @@ def prepare_input_ids(
     }
     return model_inputs
 
+token = 0
 
 def causal_lm_forward(
     self,
@@ -505,12 +506,19 @@
     else:
         input_list = input_ids[0]
     input_length = len(input_list)
+    global token
     if input_length > 1:
         logits = run_prefill_with_logits(self.model_ptr, input_list,
                                          self.logits_buffer, self.vocab_size)
+        filename = rf"D:\ruonan\debug log\python cpp\decode_logits_{token}.bin"
+        logits.numpy().tofile(filename)
+        token += 1
     else:
         logits = run_decode_with_logits(self.model_ptr, input_list[0],
                                         self.logits_buffer, self.vocab_size)
+        filename = rf"D:\ruonan\debug log\python cpp\decode_logits_{token}.bin"
+        logits.numpy().tofile(filename)
+        token += 1
 
     return CausalLMOutputWithPast(
         loss=None,
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py
index a1dac609243..42cf72e353c 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py
@@ -225,6 +225,7 @@ def attention(self,
             head_dim=head_dim,
         )
         new_key_states = key_states
+        new_value_states = value_states
 
         if mode == "decode":
             key_states = self.concat(past_key, key_states, axis=-2)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
index 397739cb72a..a78c1a8afb2 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -116,6 +116,9 @@ def __init__(
         self.cached_sin = cached_sin
         self.batch_size, self.seq_len, self.hidden_size = hidden_shape
         self.mode = mode
+        if mode == "prefill":
+            print(" rms_norm_eps is ", rms_norm_eps)
+            print(" n_splits_down_proj is ", n_splits_down_proj)
         self.rms_norm_eps = rms_norm_eps
         self.transpose_value = transpose_value
         self.num_layers = num_layers
@@ -234,15 +237,25 @@
                 new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
         print(f"{mode} start compiling")
-        if (
-            group_size != 0
-            and (mode == "prefill" or num_layers == 2 or num_layers == 3)
-            and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
-        ):
-            self.compile(npu_dpu_groups=6)
-        else:
-            self.compile()
+        # if (
+        #     group_size != 0
+        #     and (mode == "prefill" or num_layers == 2 or num_layers == 3)
+        #     and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
+        # ):
+        #     self.compile(npu_dpu_groups=6)
+        # else:
+        #     self.compile()
+        self.compile()
         print(f"{mode} end compiling")
+        # if mode == "prefill":
+        #     # # path = r"D:\\ruonan\\debug log\\python cpp\\prefill_layer.xml"
+        #     path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.xml"
+        #     print("save Prefill IR!!!!!")
+        #     self.save(path)
+        #     # path = r"D:\\ruonan\\debug log\\python cpp\\prefill_layer.blob"
+        #     path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.blob"
+        #     self.saveCompiledModel(path)
+
 
     def build_decoder(
         self,
@@ -416,6 +429,20 @@ def forward(
         for i in range(self.intra_stages):
             start, end = self.layer_ranges[i]
             self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end])
+            for j in range(start, end):
+                key_ = past_key_value.key_cache[self.layer_indexes[j]]  # shape is [1, 32, 28, 128]
+                val_ = past_key_value.value_cache[self.layer_indexes[j]]
+                new_size = (
+                    key_.size(0),
+                    key_.size(1),
+                    self.max_seq_len,
+                    key_.size(3),
+                )
+                key = key_.as_strided(new_size, key_.stride(), storage_offset=0)
+                print(val_.shape, val_.stride())
+                val = val_.as_strided(new_size, val_.stride(), storage_offset=0)
+                key.numpy().tofile(rf"D:\ruonan\debug log\acc lib\forward_input_key_{self.layer_indexes[j]}.bin")
+                val.numpy().tofile(rf"D:\ruonan\debug log\acc lib\forward_input_value_{self.layer_indexes[j]}.bin")
 
         hidden_states, new_keys, new_values = LowBitQwenMultiDecoderlayer.run_decoders(
             inputs,
@@ -535,7 +562,7 @@ def forward(
         inputs += (self.layer_norm_0, self.layer_norm_1)
         inputs += (self.q_bias, self.k_bias, self.v_bias)
         hidden_states, past_key, past_value = run_model(
-            inputs, self.op_parameters, backend_cls, self.op_id, replica=2
+            inputs, self.op_parameters, backend_cls, self.op_id, replica=1
         )
         cache_kwargs = {"max_seq_len": self.max_seq_len, "transpose": self.transpose_value}
         key_states, value_states = past_key_value.update(
@@ -878,7 +905,18 @@ def run_prefill(
             group_size=group_size,
             asym=asym
         )
-
+        if layer_idx == 0:
+            save_idx = 3
+            for w in [layer_norm_0, layer_norm_1, attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16), attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16),
+                      attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16), *weights]:
+                if isinstance(w, torch.Tensor):
+                    w.detach().to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
+                    save_idx += 1
+                elif len(w) == 2:
+                    w[0].detach().to(torch.uint8).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
+                    save_idx += 1
+                    w[1].detach().to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
+                    save_idx += 1
         layer_weights.extend(weights)
         input_layer_norm_weights.append(layer_norm_0)
         post_attn_layernorm_weights.append(layer_norm_1)
@@ -895,8 +933,13 @@
             break
 
     hidden_states, position_ids, causal_mask, past_key_values = result
+
+    hidden_states.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_0.bin")
+    causal_mask.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_1.bin")
+    position_ids.to(torch.int64).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_2.bin")
+
     with torch.inference_mode():
-        for decoder_layer in deocderlayers:
+        for idx, decoder_layer in enumerate(deocderlayers):
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask=causal_mask,
@@ -908,6 +951,10 @@
             hidden_states = layer_outputs[0]
             next_decoder_cache = layer_outputs[1]
+            if idx == 0:
+                hidden_states.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_0.bin")
+                next_decoder_cache.key_cache[0].to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_1.bin")
+                next_decoder_cache.value_cache[0].to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_2.bin")
 
     result_queue.put((hidden_states, next_decoder_cache))
 
 
@@ -1109,6 +1156,7 @@
 
     return qwen2_fused_model_forward
 
+token = 0
 
 def qwen2_casullm_forward(
     self,
@@ -1152,6 +1200,10 @@
     # ipex-llm change end
     logits = self.lm_head(hidden_states)
     logits = logits.float()
+    global token
+    filename = rf"D:\ruonan\debug log\acc lib\decode_logits_{token}.bin"
+    logits.numpy().tofile(filename)
+    token += 1
 
     loss = None
     if labels is not None:
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
index 87459a99e98..5e3130e1d53 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
@@ -27,7 +27,12 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
                                        npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
-    model.save(xml_path)
+    # model.save(xml_path)
+    if model_name != "decoder_layer_prefill":
+        model.save(xml_path)
+    else:
+        print("read D:\\ruonan\debug log\\acc lib\\prefill_layer.xml")
+        xml_path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.xml"
     new_ir_path = os.path.join(dir, model_name + "_new.xml")
     new_bin_path = os.path.join(dir, model_name + "_new.bin")
     blob_path = os.path.join(dir, model_name + ".blob")
@@ -35,13 +40,13 @@
     core = Core()
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
-    core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    # core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
     if (
         npu_dpu_groups is not None
         and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
     ):
         core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
-
+    print(xml_path)
     model = core.read_model(xml_path)
     inputs = model.inputs
     for idx, input in enumerate(inputs):
@@ -61,7 +66,7 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     with open(blob_path, 'wb') as f:
         f.write(model_stream)
 
-    os.remove(xml_path)
+    # os.remove(xml_path)
     if not keep_ir:
         os.remove(new_ir_path)
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
index bb8003f06a7..92f4ebbe135 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -172,7 +172,9 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
-        npu_dpu_groups = 6
+        # npu_dpu_groups = 6
+        npu_dpu_groups = None
+        print("prefill npu dpu groups : ", npu_dpu_groups)
 
     single_decoder = LowBitQwenMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
@@ -199,7 +201,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     )
 
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
-                                                        temp_dir, True, False,
+                                                        temp_dir, True, True,
                                                         npu_dpu_groups=npu_dpu_groups)
    # 0, 1, 2 are input_embed/attention_mask/position_id
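
Note: the dumps written by the two paths above (decode_logits_{token}.bin under "D:\ruonan\debug log\acc lib" from qwen2_casullm_forward, and under "D:\ruonan\debug log\python cpp" from causal_lm_forward) can be compared offline. Below is a minimal sketch of such a comparison, not part of the patch; the helper name, the float32 dtype, and the default vocab_size are assumptions and should be adjusted to whatever the actual run produced via tensor.numpy().tofile(...).

# compare_dumps.py -- hypothetical helper, not part of the patch above.
# Assumes both runs wrote decode_logits_{i}.bin as float32 and that vocab_size
# matches the model; adjust both if the actual dumps differ.
import numpy as np

ACC_LIB_DIR = r"D:\ruonan\debug log\acc lib"        # dumps from qwen2_casullm_forward
PYTHON_CPP_DIR = r"D:\ruonan\debug log\python cpp"  # dumps from causal_lm_forward


def compare_logits(token_idx, vocab_size=151936, dtype=np.float32):
    a = np.fromfile(rf"{ACC_LIB_DIR}\decode_logits_{token_idx}.bin", dtype=dtype)
    b = np.fromfile(rf"{PYTHON_CPP_DIR}\decode_logits_{token_idx}.bin", dtype=dtype)
    # Keep only the last position's logits in case the prefill dump covers the whole prompt.
    a = a.reshape(-1, vocab_size)[-1]
    b = b.reshape(-1, vocab_size)[-1]
    print(f"token {token_idx}: max abs diff = {np.abs(a - b).max():.6f}, "
          f"argmax match = {a.argmax() == b.argmax()}")


if __name__ == "__main__":
    for t in range(4):
        compare_logits(t)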