[NPU] Support Baichuan groupwise & gw code refactor #12337

Merged · 23 commits · Nov 8, 2024
Changes from 1 commit
remove code
cyita committed Nov 7, 2024
commit 4a5cdd6d7bdc1d17ab47b84c0f2960eb24bc9543
python/llm/src/ipex_llm/transformers/npu_models/baichuan_mp.py — 11 changes: 1 addition & 10 deletions

@@ -199,20 +199,11 @@ def __init__(
             new_key_states = self.convert_to_fp16(curr_key_values[i][0])
             new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
-        # print("start compiling")
-        # self.compile()
-        print(f"{mode} start compiling - {num_layers}-{n_splits_linear}-{n_splits_down_proj}")
-        t1 = time.perf_counter()
+        print("start compiling")
         if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1":
             self.compile(npu_dpu_groups=6)
         else:
             self.compile()
-        t2 = time.perf_counter()
-        print(f"{mode} end compiling - {num_layers}-{n_splits_linear}-{n_splits_down_proj}, time: {t2 - t1}s")
-        xml_path = f"gw/baichuan2-7b-npu-{mode}-{num_layers}-{transpose_value}-{n_splits_linear}-{n_splits_down_proj}.xml"
-
-        if not os.path.exists(xml_path):
-            self.save(xml_path)
 
 
     def attention(self,
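
For readers skimming the diff: the retained logic gates the NPU compile optimization on an environment variable, so prefill graphs compile with `npu_dpu_groups=6` unless `IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1` is set. Below is a minimal, self-contained sketch of that control flow; the class and its `compile` stub are placeholders, not the actual graph wrapper in `baichuan_mp.py`.

```python
import os

class GraphBuilderSketch:
    """Illustrative stand-in for the NPU graph wrapper; not the real class."""

    def compile(self, npu_dpu_groups=None):
        # Placeholder: the real implementation lowers the graph for the NPU.
        print(f"compiling (npu_dpu_groups={npu_dpu_groups})")

    def build(self, mode: str):
        # Prefill graphs opt into the DPU-group optimization unless the
        # IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1 escape hatch is set.
        if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1":
            self.compile(npu_dpu_groups=6)
        else:
            self.compile()

GraphBuilderSketch().build("prefill")  # compiling (npu_dpu_groups=6)
GraphBuilderSketch().build("decode")   # compiling (npu_dpu_groups=None)
```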
python/llm/src/ipex_llm/transformers/npu_models/convert.py — 2 changes: 0 additions & 2 deletions

@@ -71,8 +71,6 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
                                          iqtype, device=device)
-    # if layer.out_features == 125696:
-    #     group_size = 0
     return QuantizedLinear(qweights, scale, layer.bias,
                            group_size=group_size)
 
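The `group_size` argument passed to `QuantizedLinear` is what makes the quantization groupwise: with a non-zero group size, each contiguous block of input channels gets its own scale instead of sharing one per output row. The sketch below illustrates that idea with generic symmetric int4 scaling; it is not the `ggml_convert_qtype` kernel the PR actually calls.

```python
import torch

def groupwise_int4_scales(weight: torch.Tensor, group_size: int):
    """Illustrative groupwise symmetric int4 quantization of a (out_features, in_features) weight."""
    out_features, in_features = weight.shape
    assert in_features % group_size == 0, "in_features must be divisible by group_size"
    groups = weight.reshape(out_features, in_features // group_size, group_size)
    # One scale per (output row, input group): map the group's max magnitude onto 7.
    scales = (groups.abs().amax(dim=-1) / 7.0).clamp(min=1e-8)
    q = torch.round(groups / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
    return q.reshape(out_features, in_features), scales

q, scales = groupwise_int4_scales(torch.randn(8, 256), group_size=64)
print(q.shape, scales.shape)  # torch.Size([8, 256]) torch.Size([8, 4])
```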
@@ -87,7 +87,6 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         model.llm.config.model_type = "llama"
         model = model.llm
 
-    print(model)
     if model.config.model_type in ["qwen2", "llama", "minicpm", "baichuan"]:
         from ipex_llm.transformers.npu_models.common import split_linears
         if quantization_group_size == 0:
@@ -107,7 +106,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         model.apply(lambda m: split_linears(m, n_splits_hidden_size=n_splits_linear,
                                             n_splits_down_proj=n_splits_down_proj,
                                             load=load))
 
         if quantization_group_size != 0:
             split_num = model.config.hidden_size // quantization_group_size
             if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
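
The second hunk keeps the rule that, for groupwise quantization, linear layers are split so each slice spans whole quantization groups along the hidden dimension (`split_num = hidden_size // quantization_group_size`). A hedged sketch of that splitting step is shown below; `split_linear_by_group` is a hypothetical helper for illustration, not the repo's `split_linears`.

```python
import torch
import torch.nn as nn

def split_linear_by_group(layer: nn.Linear, group_size: int):
    """Split a bias-free Linear along its input dimension into in_features // group_size slices."""
    split_num = layer.in_features // group_size
    parts = []
    for i in range(split_num):
        part = nn.Linear(group_size, layer.out_features, bias=False)
        part.weight.data = layer.weight.data[:, i * group_size:(i + 1) * group_size].clone()
        parts.append(part)
    return parts

layer = nn.Linear(1024, 1024, bias=False)
parts = split_linear_by_group(layer, group_size=256)

# Summing the slice outputs over matching input chunks reproduces the original layer.
x = torch.randn(2, 1024)
y = sum(p(c) for p, c in zip(parts, x.split(256, dim=-1)))
print(len(parts), torch.allclose(y, layer(x), atol=1e-5))  # 4 True
```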
python/llm/src/ipex_llm/transformers/npu_models/linear.py — 1 change: 0 additions & 1 deletion

@@ -158,7 +158,6 @@ def __init__(
         if group_size != 0:
             self.scale = Parameter(scale, requires_grad=False)
         else:
-            # print("scale_factor True")
             if self.weight.dtype == torch.uint8:
                 # Int4 we need to double the input channels because weights are compressed
                 self.inC *= 2
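
The unchanged comment above explains why `inC` is doubled for `torch.uint8` weights: two 4-bit values are packed into each stored byte, so the packed tensor has half as many input channels as the logical weight. A small unpacking sketch under that assumption follows; the nibble order is chosen arbitrarily for illustration and may not match the actual ggml/NPU layout.

```python
import torch

def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
    """Unpack a uint8 tensor holding two signed 4-bit weights per byte (low nibble first)."""
    low = (packed & 0x0F).to(torch.int8)
    high = (packed >> 4).to(torch.int8)
    # Map the unsigned nibbles [0, 15] back to signed int4 values in [-8, 7].
    low = torch.where(low > 7, low - 16, low)
    high = torch.where(high > 7, high - 16, high)
    return torch.stack((low, high), dim=-1).reshape(*packed.shape[:-1], packed.shape[-1] * 2)

packed = torch.randint(0, 256, (4, 8), dtype=torch.uint8)
print(unpack_int4(packed).shape)  # torch.Size([4, 16]) -- input channels doubled
```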
@@ -374,7 +374,6 @@ def convert_llm(model: torch.nn.Module,
         model.lm_head_1.get_fused_lm_head()
         model.lm_head_0.get_fused_lm_head()
 
-    print(model)
     # patch generate function
     import types
     model.generate = types.MethodType(generate, model)
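
The context around the removed `print(model)` shows the pattern used to patch generation: `types.MethodType` rebinds a function as a bound method on the model instance, so only that object is affected rather than its class. A minimal standalone sketch of the pattern (toy class and function, not the ipex-llm `generate`):

```python
import types

class ToyModel:
    def generate(self, prompt):
        return f"original: {prompt}"

def patched_generate(self, prompt):
    # Custom pre/post-processing would go here before delegating to the NPU runner.
    return f"patched for NPU: {prompt}"

model = ToyModel()
model.generate = types.MethodType(patched_generate, model)  # patch this instance only
print(model.generate("hello"))    # patched for NPU: hello
print(ToyModel().generate("hi"))  # original: hi
```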