Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
rnwang04 committed Nov 18, 2024
1 parent 5be4913 commit f1a991a
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 22 deletions.
10 changes: 8 additions & 2 deletions python/llm/src/ipex_llm/transformers/npu_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ def from_pretrained(cls, *args, **kwargs):
mixed_precision = kwargs.pop('mixed_precision', False)
quantization_group_size = kwargs.pop("quantization_group_size", 0)
mock_device = kwargs.pop('device', None) # For mock on CPU
compile_full_model = kwargs.pop('compile_full_model', False)
save_directory = kwargs.pop('save_directory', None)

invalidInputError(
quantization_group_size in [0, 32, 64, 128],
Expand Down Expand Up @@ -197,7 +199,9 @@ def from_pretrained(cls, *args, **kwargs):
"max_prompt_len": max_prompt_len,
"inter_pp": inter_pp,
"intra_pp": intra_pp,
"transpose_value_cache": transpose_value_cache
"transpose_value_cache": transpose_value_cache,
"compile_full_model": compile_full_model,
"save_directory": save_directory,
}
model = cls.optimize_npu_model(*args, **optimize_kwargs)
else:
Expand Down Expand Up @@ -271,7 +275,9 @@ def optimize_npu_model(cls, *args, **kwargs):
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size)
group_size=quantization_group_size,
compile_full_model=compile_full_model,
save_directory=save_directory)
model.save_low_bit = types.MethodType(save_low_bit, model)
return model

Expand Down
18 changes: 10 additions & 8 deletions python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,12 @@
import numpy as np


def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True):
def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
xml_path = os.path.join(dir, model_name + ".xml")
bin_path = os.path.join(dir, model_name + ".bin")
model.save(xml_path)
new_ir_path = os.path.join(dir, model_name + "_new.xml")
new_bin_path = os.path.join(dir, model_name + "_new.bin")
blob_path = os.path.join(dir, model_name + ".blob")

core = Core()
Expand Down Expand Up @@ -54,7 +56,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
f.write(model_stream)

os.remove(xml_path)
# os.remove(new_ir_path)
os.remove(bin_path)

if not keep_ir:
os.remove(new_ir_path)
os.remove(new_bin_path)

return blob_path

Expand Down Expand Up @@ -95,8 +101,7 @@ def __init__(
if n_splits == 1:
input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size))
else:
# input = self.create_input_op((1, self.batch_size, self.hidden_size))
input = self.create_input_op((self.batch_size, self.seq_len, self.hidden_size))
input = self.create_input_op((1, self.batch_size, self.hidden_size))

hidden_states = input

Expand Down Expand Up @@ -135,10 +140,7 @@ def __init__(
self.dtype = dtype

# define input
if input_length > 1:
weight = self.parameter((vocab_size, embedding_dim))
else:
weight = self.constant(embedding_weight)
weight = self.constant(embedding_weight)
input = self.parameter((1, input_length), dtype=np.int32)

if padding_idx == -1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,7 @@ def convert_llm(model: torch.nn.Module,
with tempfile.TemporaryDirectory() as temp_dir:
if save_directory is not None:
temp_dir = save_directory
temp_dir = r"D:\ruonan\qwen2.5-7B-full-weights-512-new"
if os.path.exists(temp_dir):
import shutil
shutil.rmtree(temp_dir)
os.mkdir(temp_dir)
os.mkdir(temp_dir)
weight_dir = os.path.join(temp_dir, "model_weights")
os.mkdir(weight_dir)
layer_num = len(model.model.layers)
Expand Down
16 changes: 9 additions & 7 deletions python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
np_dtype = np.float16

new_lm_head = LowBitLLMLMHead(
[1, input_length, num_heads * head_dim],
[1, 1, num_heads * head_dim],
num_heads=num_heads,
max_seq_len=1, # the value does not seem to matter here
rms_norm_eps=rms_norm_eps,
Expand All @@ -57,10 +57,9 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
vocab_size=vocab_size,
n_splits=n_splits_linear
)
suffix = "_prefill" if input_length > 1 else ""
compile = False if input_length > 1 else True
last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head{suffix}",
temp_dir, compile)

last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head",
temp_dir, True, True)

# save weights bins files
if not isinstance(lm_head, SlicedLMHead):
Expand All @@ -84,8 +83,11 @@ def convert_lm_head_and_embedding(model, n_splits_linear, temp_dir, weight_dir,
dtype=np.float16,
input_length=input_length,
)
first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding{suffix}",
temp_dir, compile)
suffix = "_prefill" if input_length > 1 else ""
compile = False if input_length > 1 else True
if input_length == 0:
first_blob_path = update_names_of_IR_and_export_blob(new_embedding, f"embedding{suffix}",
temp_dir, compile, keep_ir=False)
if input_length > 1:
bin_file = os.path.join(weight_dir, f"model_embedding_input_0.bin")
embedding_layer.weight.to(torch.float16).detach().numpy().tofile(bin_file)
Expand Down

0 comments on commit f1a991a

Please sign in to comment.