Commit: fix

rnwang04 committed Nov 18, 2024
1 parent f1a991a commit af61841
Showing 4 changed files with 84 additions and 159 deletions.
2 changes: 2 additions & 0 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -239,6 +239,8 @@ def optimize_npu_model(cls, *args, **kwargs):
     inter_pp = kwargs.pop("inter_pp", None)
     intra_pp = kwargs.pop("intra_pp", None)
     transpose_value_cache = kwargs.pop("transpose_value_cache", True)
+    compile_full_model = kwargs.pop('compile_full_model', False)
+    save_directory = kwargs.pop('save_directory', None)

     if hasattr(model, "llm"):
         llm = model.llm
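The two kwargs above are popped inside optimize_npu_model but originate from the caller. A minimal sketch of how they might be passed through from_pretrained, assuming the usual ipex-llm NPU options; the model id and output directory are illustrative, not taken from this commit:

from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# Hypothetical call site: from_pretrained forwards extra kwargs to the NPU
# optimization path, where the two new ones are popped as shown above.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",          # illustrative model id
    load_in_low_bit="sym_int4",
    optimize_model=True,
    compile_full_model=True,             # new kwarg: also build prefill blobs
    save_directory="./qwen2-npu-blobs",  # new kwarg: where converted blobs land
)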
@@ -56,7 +56,7 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
f.write(model_stream)

os.remove(xml_path)
-    os.remove(bin_path)
+    # os.remove(bin_path)

if not keep_ir:
os.remove(new_ir_path)
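Presumably the exported .bin now has to outlive this helper because full-model compilation consumes it after export; deletion is left to the callers, which remove the file themselves once done (see the os.remove(bin_path) calls in qwen.py below).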
@@ -350,9 +350,9 @@ def convert_llm(model: torch.nn.Module,
result = pool.starmap(convert_qwen_layer, param_list)

if compile_full_model:
-        from .qwen import convert_qwen_prefill_layer, convert_lm_head_and_embedding
-        convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len, group_size)
+        convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
+                           temp_dir, weight_dir, transpose_value_cache, max_prompt_len, group_size,
+                           "prefill")
convert_lm_head_and_embedding(model, n_splits_linear,
temp_dir, weight_dir, max_prompt_len)

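The prefill blob is now produced by the same convert_qwen_layer entry point rather than a dedicated convert_qwen_prefill_layer. A sketch of the resulting flow, assuming layernorm_const is forwarded the same way as in the decode calls above (the plain loop stands in for the pool.starmap over param_list):

# decode blobs: one conversion per decoder layer
for layer_idx in range(num_layers):
    convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
                       temp_dir, weight_dir, transpose_value_cache, kv_len,
                       group_size, layernorm_const, "decode")

# one extra prefill blob through the same entry point
convert_qwen_layer(model, 0, n_splits_linear, n_splits_down_proj,
                   temp_dir, weight_dir, transpose_value_cache, max_prompt_len,
                   group_size, layernorm_const, "prefill")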
233 changes: 78 additions & 155 deletions python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
@@ -85,18 +85,19 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
)
suffix = "_prefill" if input_length > 1 else ""
compile = False if input_length > 1 else True
-    if input_length == 1:
+    if input_length == 1:
first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding{suffix}",
temp_dir, compile, keep_ir=False)
-    if input_length > 1:
+    else:
bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
first_blob_path = None
return first_blob_path, last_blob_path


def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
temp_dir, weight_dir, transpose_value_cache, kv_len, group_size,
-                       layernorm_const):
+                       layernorm_const, mode="decode"):
num_heads = model.model.layers[0].self_attn.num_heads
num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
head_dim = model.model.layers[0].self_attn.head_dim
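Two changes land in the hunk above: the embedding blob is now exported when input_length == 1 (the decode graph) instead of the apparently never-true == 0, with prefill (input_length > 1) falling through to the raw-weight else branch; and convert_qwen_layer gains a mode parameter defaulting to "decode", which the rest of the file dispatches on below.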
@@ -133,162 +134,84 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
else: # FP16 Linear
np_dtype = np.float16

-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, 1, num_heads * head_dim],
-        input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
-        post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="decode",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        f"decoder_layer_{layer_idx}",
-                                                        temp_dir)
-    bin_path = os.path.join(temp_dir, f"decoder_layer_{layer_idx}" + ".bin")
-    os.remove(bin_path)
-
-    # 0, 1, 2 are input_embed/attention_mask/position_id
-    if layernorm_const:
-        st_idx = 3
-    else:
-        input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-        post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-        layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-        layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-        st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
-    # 6, 7 are past k/v
-    for idx, (weight, scale) in enumerate(weights):
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
-        weight.numpy().tofile(bin_file)
-        bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
-        scale.numpy().tofile(bin_file)
-
-    del single_decoder
-
-
-def convert_qwen_prefill_layer(model, n_splits_linear, n_splits_down_proj,
-                               temp_dir, weight_dir, transpose_value_cache, kv_len, group_size):
-    layer_idx = 0
-    num_heads = model.model.layers[0].self_attn.num_heads
-    num_key_value_heads = model.model.layers[0].self_attn.num_key_value_heads
-    head_dim = model.model.layers[0].self_attn.head_dim
-    intermediate_size = model.config.intermediate_size
-    rms_norm_eps = model.config.rms_norm_eps
-
-    from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer
-    curr_layer = model.model.layers[layer_idx]
-    attn_layer = curr_layer.self_attn
-    mlp_layer = curr_layer.mlp
-
-    weights = []
-    if n_splits_linear == 1:
-        for q, k, v, o, g, u in zip(attn_layer.q_proj_dq_list,
-                                    attn_layer.k_proj_dq_list,
-                                    attn_layer.v_proj_dq_list,
-                                    attn_layer.o_proj_dq_list,
-                                    mlp_layer.gate_proj_dq_list,
-                                    mlp_layer.up_proj_dq_list):
-            weights.append((q.weight, q.scale))
-            weights.append((k.weight, k.scale))
-            weights.append((v.weight, v.scale))
-            weights.append((o.weight, o.scale))
-            weights.append((g.weight, g.scale))
-            weights.append((u.weight, u.scale))
if mode == "decode":
single_decoder = LowBitQwenMultiDecoderlayer(
[1, 1, num_heads * head_dim],
input_layernorm_weights=[layer_norm_0] if layernorm_const else None,
post_attn_layernorm_weights=[layer_norm_1] if layernorm_const else None,
q_biases=None,
k_biases=None,
v_biases=None,
cached_cos=cached_cos,
cached_sin=cached_sin,
num_heads=num_heads,
num_key_value_heads=num_key_value_heads,
num_layers=1,
max_seq_len=kv_len,
rms_norm_eps=rms_norm_eps,
intermediate_size=intermediate_size,
mode="decode",
transpose_value=transpose_value_cache,
dtype=np_dtype,
n_splits_linear=n_splits_linear,
n_splits_down_proj=n_splits_down_proj,
group_size=group_size
)
else:
-        for layer_list in [attn_layer.q_proj_dq_list, attn_layer.k_proj_dq_list,
-                           attn_layer.v_proj_dq_list, attn_layer.o_proj_dq_list,
-                           mlp_layer.gate_proj_dq_list, mlp_layer.up_proj_dq_list]:
-            l_weights = []
-            scales = []
-            for l in layer_list:
-                l_weights.append(l.weight)
-                scales.append(l.scale)
-            weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    if n_splits_down_proj == 1:
-        for l in mlp_layer.down_proj_dq_list:
-            weights.append((l.weight, l.scale))
+        single_decoder = LowBitQwenMultiDecoderlayer(
+            [1, kv_len, num_heads * head_dim],
+            input_layernorm_weights=None,
+            post_attn_layernorm_weights=None,
+            q_biases=None,
+            k_biases=None,
+            v_biases=None,
+            cached_cos=cached_cos,
+            cached_sin=cached_sin,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+            num_layers=1,
+            max_seq_len=kv_len,
+            rms_norm_eps=rms_norm_eps,
+            intermediate_size=intermediate_size,
+            mode="prefill",
+            transpose_value=transpose_value_cache,
+            dtype=np_dtype,
+            n_splits_linear=n_splits_linear,
+            n_splits_down_proj=n_splits_down_proj,
+            group_size=group_size
+        )
+    if mode == "decode":
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            f"decoder_layer_{layer_idx}",
+                                                            temp_dir)
else:
-        l_weights = []
-        scales = []
-        for l in mlp_layer.down_proj_dq_list:
-            l_weights.append(l.weight)
-            scales.append(l.scale)
-        weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
-
-    q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16)
-    k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16)
-    v_bias = attn_layer.v_proj_dq_list.v_proj_dq_0.bias.to(torch.float16)
-    cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16)
-    cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16)
-    layer_norm_0 = curr_layer.input_layernorm.weight.to(torch.float16)
-    layer_norm_1 = curr_layer.post_attention_layernorm.weight.to(torch.float16)
-
-    if isinstance(weights[0], tuple):
-        np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8
-    else:  # FP16 Linear
-        np_dtype = np.float16
-
-    single_decoder = LowBitQwenMultiDecoderlayer(
-        [1, kv_len, num_heads * head_dim],
-        input_layernorm_weights=None,
-        post_attn_layernorm_weights=None,
-        q_biases=None,
-        k_biases=None,
-        v_biases=None,
-        cached_cos=cached_cos,
-        cached_sin=cached_sin,
-        num_heads=num_heads,
-        num_key_value_heads=num_key_value_heads,
-        num_layers=1,
-        max_seq_len=kv_len,
-        rms_norm_eps=rms_norm_eps,
-        intermediate_size=intermediate_size,
-        mode="prefill",
-        transpose_value=transpose_value_cache,
-        dtype=np_dtype,
-        n_splits_linear=n_splits_linear,
-        n_splits_down_proj=n_splits_down_proj,
-        group_size=group_size
-    )
-    rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
-                                                        "decoder_layer_prefill",
-                                                        temp_dir,
-                                                        False)
-    bin_path = os.path.join(temp_dir, "decoder_layer_prefill" + ".bin")
+        rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
+                                                            f"decoder_layer_prefill",
+                                                            temp_dir)
+    bin_path = os.path.join(temp_dir, f"decoder_layer_{layer_idx}" + ".bin")
os.remove(bin_path)

# 0, 1, 2 are input_embed/attention_mask/position_id
-    input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
-    post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
-    layer_norm_0.data.numpy().tofile(input_lm_bin_file)
-    layer_norm_1.data.numpy().tofile(post_lm_bin_file)
-    st_idx = 5
-    q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
-    k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
-    v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
-    q_bias.data.numpy().tofile(q_bias_bin_file)
-    k_bias.data.numpy().tofile(k_bias_bin_file)
-    v_bias.data.numpy().tofile(v_bias_bin_file)
if mode == "decode":
if layernorm_const:
st_idx = 3
else:
input_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_3.bin")
post_lm_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_4.bin")
layer_norm_0.data.numpy().tofile(input_lm_bin_file)
layer_norm_1.data.numpy().tofile(post_lm_bin_file)
st_idx = 5
q_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx}.bin")
k_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+1}.bin")
v_bias_bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+2}.bin")
q_bias.data.numpy().tofile(q_bias_bin_file)
k_bias.data.numpy().tofile(k_bias_bin_file)
v_bias.data.numpy().tofile(v_bias_bin_file)
# 6, 7 are past k/v
for idx, (weight, scale) in enumerate(weights):
bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin")
weight.numpy().tofile(bin_file)
bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin")
scale.numpy().tofile(bin_file)

del single_decoder
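Condensed, the refactor above folds convert_qwen_prefill_layer into convert_qwen_layer and dispatches on mode. A runnable toy illustration of that dispatch; plan_decoder_export is a made-up name, but the input shapes and blob names follow the hunk:

def plan_decoder_export(mode, layer_idx, kv_len, hidden_size):
    # decode consumes one new token per step against the KV cache;
    # prefill consumes the whole prompt in a single pass
    if mode == "decode":
        return [1, 1, hidden_size], f"decoder_layer_{layer_idx}"
    else:
        return [1, kv_len, hidden_size], "decoder_layer_prefill"

print(plan_decoder_export("decode", 3, 1024, 1536))   # ([1, 1, 1536], 'decoder_layer_3')
print(plan_decoder_export("prefill", 0, 1024, 1536))  # ([1, 1024, 1536], 'decoder_layer_prefill')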
