debug output of qwen2 #12514

Draft · wants to merge 1 commit into base: main
46 changes: 23 additions & 23 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -284,29 +284,29 @@ def optimize_npu_model(cls, *args, **kwargs):
model.share_memory()

if not pipeline:
if (not hasattr(model, 'llm') and
model.config.model_type in ["qwen2", "llama", "minicpm"]):
from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
optimize_llm_single_process(
llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size,
qtype=qtype,
save_directory=save_directory,
fuse_layers=fuse_layers
)
else:
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
# if (not hasattr(model, 'llm') and
# model.config.model_type in ["qwen2", "llama", "minicpm"]):
# from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
# optimize_llm_single_process(
# llm,
# kv_len=max_context_len,
# max_prompt_len=max_prompt_len,
# transpose_value_cache=transpose_value_cache,
# group_size=quantization_group_size,
# qtype=qtype,
# save_directory=save_directory,
# fuse_layers=fuse_layers
# )
# else:
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
else:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
8 changes: 8 additions & 0 deletions python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -485,6 +485,7 @@ def prepare_input_ids(
}
return model_inputs

token = 0

def causal_lm_forward(
self,
@@ -505,12 +506,19 @@ def causal_lm_forward(
else:
input_list = input_ids[0]
input_length = len(input_list)
global token
if input_length > 1:
logits = run_prefill_with_logits(self.model_ptr, input_list,
self.logits_buffer, self.vocab_size)
filename = rf"D:\ruonan\debug log\python cpp\decode_logits_{token}.bin"
logits.numpy().tofile(filename)
token += 1
else:
logits = run_decode_with_logits(self.model_ptr, input_list[0],
self.logits_buffer, self.vocab_size)
filename = rf"D:\ruonan\debug log\python cpp\decode_logits_{token}.bin"
logits.numpy().tofile(filename)
token += 1

return CausalLMOutputWithPast(
loss=None,
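The added dump writes one `decode_logits_<token>.bin` per forward call into the `python cpp` directory (the whole prefill logits buffer on the first call, then one decode step per file). A minimal sketch of how such a file could be inspected offline; the float32 dtype and the Qwen2 vocabulary size are assumptions, not part of this PR:

```python
import numpy as np

# Hypothetical reader for one dumped logits file; dtype and vocab size are assumptions.
vocab_size = 151936  # Qwen2 vocabulary size (assumed)
logits = np.fromfile(r"D:\ruonan\debug log\python cpp\decode_logits_0.bin", dtype=np.float32)
logits = logits.reshape(-1, vocab_size)   # one row of logits per position
print(logits.shape, logits[-1].argmax())  # predicted token id at the last position
```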
@@ -225,6 +225,7 @@ def attention(self,
head_dim=head_dim,
)
new_key_states = key_states
new_value_states = value_states

if mode == "decode":
key_states = self.concat(past_key, key_states, axis=-2)
74 changes: 63 additions & 11 deletions python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py
@@ -116,6 +116,9 @@ def __init__(
self.cached_sin = cached_sin
self.batch_size, self.seq_len, self.hidden_size = hidden_shape
self.mode = mode
if mode == "prefill":
print(" rms_norm_eps is ", rms_norm_eps)
print(" n_splits_down_proj is ", n_splits_down_proj)
self.rms_norm_eps = rms_norm_eps
self.transpose_value = transpose_value
self.num_layers = num_layers
@@ -234,15 +237,25 @@ def __init__(
new_value_states = self.convert_to_fp16(curr_key_values[i][1])

print(f"{mode} start compiling")
if (
group_size != 0
and (mode == "prefill" or num_layers == 2 or num_layers == 3)
and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
):
self.compile(npu_dpu_groups=6)
else:
self.compile()
# if (
# group_size != 0
# and (mode == "prefill" or num_layers == 2 or num_layers == 3)
# and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
# ):
# self.compile(npu_dpu_groups=6)
# else:
# self.compile()
self.compile()
print(f"{mode} end compiling")
# if mode == "prefill":
# # # path = r"D:\\ruonan\\debug log\\python cpp\\prefill_layer.xml"
# path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.xml"
# print("save Prefill IR!!!!!")
# self.save(path)
# # path = r"D:\\ruonan\\debug log\\python cpp\\prefill_layer.blob"
# path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.blob"
# self.saveCompiledModel(path)


def build_decoder(
self,
@@ -416,6 +429,20 @@ def forward(
for i in range(self.intra_stages):
start, end = self.layer_ranges[i]
self.backend_decoders[i].update_cache(past_key_value, self.layer_indexes[start:end])
for j in range(start, end):
key_ = past_key_value.key_cache[self.layer_indexes[j]] # shape is [1, 32, 28, 128]
val_ = past_key_value.value_cache[self.layer_indexes[j]]
new_size = (
key_.size(0),
key_.size(1),
self.max_seq_len,
key_.size(3),
)
key = key_.as_strided(new_size, key_.stride(), storage_offset=0)
print(val_.shape, val_.stride())
val = val_.as_strided(new_size, val_.stride(), storage_offset=0)
key.numpy().tofile(rf"D:\ruonan\debug log\acc lib\forward_input_key_{self.layer_indexes[j]}.bin")
val.numpy().tofile(rf"D:\ruonan\debug log\acc lib\forward_input_value_{self.layer_indexes[j]}.bin")

hidden_states, new_keys, new_values = LowBitQwenMultiDecoderlayer.run_decoders(
inputs,
@@ -535,7 +562,7 @@ def forward(
inputs += (self.layer_norm_0, self.layer_norm_1)
inputs += (self.q_bias, self.k_bias, self.v_bias)
hidden_states, past_key, past_value = run_model(
inputs, self.op_parameters, backend_cls, self.op_id, replica=2
inputs, self.op_parameters, backend_cls, self.op_id, replica=1
)
cache_kwargs = {"max_seq_len": self.max_seq_len, "transpose": self.transpose_value}
key_states, value_states = past_key_value.update(
@@ -878,7 +905,18 @@ def run_prefill(
group_size=group_size,
asym=asym
)

if layer_idx == 0:
save_idx = 3
for w in [layer_norm_0, layer_norm_1, attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16), attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16),
attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16), *weights]:
if isinstance(w, torch.Tensor):
w.detach().to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
save_idx += 1
elif len(w) == 2:
w[0].detach().to(torch.uint8).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
save_idx += 1
w[1].detach().to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_{save_idx}.bin")
save_idx += 1
layer_weights.extend(weights)
input_layer_norm_weights.append(layer_norm_0)
post_attn_layernorm_weights.append(layer_norm_1)
@@ -895,8 +933,13 @@ def run_prefill(
break

hidden_states, position_ids, causal_mask, past_key_values = result

hidden_states.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_0.bin")
causal_mask.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_1.bin")
position_ids.to(torch.int64).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_input_2.bin")

with torch.inference_mode():
for decoder_layer in deocderlayers:
for idx, decoder_layer in enumerate(deocderlayers):
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
@@ -908,6 +951,10 @@ def run_prefill(

hidden_states = layer_outputs[0]
next_decoder_cache = layer_outputs[1]
if idx == 0:
hidden_states.to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_0.bin")
next_decoder_cache.key_cache[0].to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_1.bin")
next_decoder_cache.value_cache[0].to(torch.float16).numpy().tofile(rf"D:\ruonan\debug log\acc lib\prefill_output_2.bin")
result_queue.put((hidden_states, next_decoder_cache))


@@ -1109,6 +1156,7 @@ def qwen2_fused_model_forward(

return qwen2_fused_model_forward

token = 0

def qwen2_casullm_forward(
self,
@@ -1152,6 +1200,10 @@ def qwen2_casullm_forward(
# ipex-llm change end
logits = self.lm_head(hidden_states)
logits = logits.float()
global token
filename = rf"D:\ruonan\debug log\acc lib\decode_logits_{token}.bin"
logits.numpy().tofile(filename)
token += 1

loss = None
if labels is not None:
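With matching dumps added in `convert.py` (the C++ pipeline path, written to the `python cpp` directory) and here in `qwen2_casullm_forward` (the acc-lib path, written to `acc lib`), the two runs can be compared step by step. A rough sketch under the same dtype/shape assumptions as above; the number of steps to compare is arbitrary:

```python
import numpy as np

vocab_size = 151936  # Qwen2 vocabulary size (assumed)
for token in range(5):  # number of dumped steps to compare (assumed)
    a = np.fromfile(rf"D:\ruonan\debug log\python cpp\decode_logits_{token}.bin", dtype=np.float32)
    b = np.fromfile(rf"D:\ruonan\debug log\acc lib\decode_logits_{token}.bin", dtype=np.float32)
    a = a.reshape(-1, vocab_size)[-1]  # compare the last position only
    b = b.reshape(-1, vocab_size)[-1]
    print(token, "max |diff|:", np.abs(a - b).max(), "argmax:", a.argmax(), b.argmax())
```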
@@ -27,21 +27,26 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
npu_dpu_groups=None):
xml_path = os.path.join(dir, model_name + ".xml")
bin_path = os.path.join(dir, model_name + ".bin")
model.save(xml_path)
# model.save(xml_path)
if model_name != "decoder_layer_prefill":
model.save(xml_path)
else:
print("read D:\\ruonan\debug log\\acc lib\\prefill_layer.xml")
xml_path = r"D:\\ruonan\\debug log\\acc lib\\prefill_layer.xml"
new_ir_path = os.path.join(dir, model_name + "_new.xml")
new_bin_path = os.path.join(dir, model_name + "_new.bin")
blob_path = os.path.join(dir, model_name + ".blob")

core = Core()
core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
"compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
# core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
if (
npu_dpu_groups is not None
and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
):
core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})

print(xml_path)
model = core.read_model(xml_path)
inputs = model.inputs
for idx, input in enumerate(inputs):
@@ -61,7 +66,7 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
with open(blob_path, 'wb') as f:
f.write(model_stream)

os.remove(xml_path)
# os.remove(xml_path)

if not keep_ir:
os.remove(new_ir_path)
@@ -172,7 +172,9 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
else:
input_len = kv_len
decoder_name = "decoder_layer_prefill"
npu_dpu_groups = 6
# npu_dpu_groups = 6
npu_dpu_groups = None
print("prefill npu dpu groups : ", npu_dpu_groups)

single_decoder = LowBitQwenMultiDecoderlayer(
[1, input_len, num_heads * head_dim],
Expand All @@ -199,7 +201,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
)
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
decoder_name,
temp_dir, True, False,
temp_dir, True, True,
npu_dpu_groups=npu_dpu_groups)

# 0, 1, 2 are input_embed/attention_mask/position_id