fix style
rnwang04 committed Dec 24, 2024
1 parent c37a180 commit 50ace72
Showing 4 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/npu_models/common.py
@@ -111,5 +111,5 @@ def is_auto_round_model(model: torch.nn.Module):
if hasattr(model, "quantization_config"):
quant_config = getattr(model.config, "quantization_config", None)
if quant_config is not None and quant_config.quant_method == "intel/auto-round":
return True
return True
return False
@@ -28,7 +28,7 @@ def unpack_auto_round_layer(layer, qtype="sym_int4_rtn"):
n, m = layer.infeatures, layer.outfeatures
weight = layer.qweight.to("cpu")
scale = layer.scales.to("cpu")
zeros = layer.qzeros.to("cpu") # np.int32, 1 x m // 4
zeros = layer.qzeros.to("cpu") # np.int32, 1 x m // 4
bits = layer.bits

scale = scale.t().contiguous()
@@ -38,13 +38,13 @@ def unpack_auto_round_layer(layer, qtype="sym_int4_rtn"):

for i in range(0, n // num):
for j in range(0, num):
int_weight[i * num + j, :] = (( weight[i, :] >> (j * bits) ) & 0x0000000F ).to(torch.uint8)
int_weight[i*num + j, :] = ((weight[i, :] >> (j*bits)) & 0x0000000F).to(torch.uint8)

int_weight = (int_weight - 8).to(torch.int8) # n, m
qweights = int_weight.t().contiguous() # m, n
int_weight = (int_weight - 8).to(torch.int8) # n, m
qweights = int_weight.t().contiguous() # m, n

# if we want to transform it to our NPU format, uncomment below code
qweights = qweights.reshape(m, -1 , 2) # m * n/2 * 2
qweights = qweights.reshape(m, -1, 2) # m * n/2 * 2
low_bit, high_bit = qweights.split(1, dim=-1)
high_bit = high_bit.squeeze().view(torch.int8)
low_bit = low_bit.squeeze().view(torch.int8)
@@ -61,16 +61,16 @@ def unpack_auto_round_layer(layer, qtype="sym_int4_rtn"):

for i in range(0, m // num):
for j in range(0, num):
int_zero[:, i * num + j] = (( zero[:, i] >> (j * bits) ) & 0x0000000F ).to(torch.uint8)
int_zero[:, i*num + j] = ((zero[:, i] >> (j*bits)) & 0x0000000F).to(torch.uint8)

zero = int_zero.to(torch.int8)
zero = zero.t().contiguous() # m, 1
zero = zero.t().contiguous() # m, 1
zero = zero.to(torch.float32) * -1 * scale
zero += 8 * scale
else:
invalidInputError(False,
f"unpack_auto_round_layer does not support qtype {qtype}.")
return qweights.view(torch.uint8), scale.to(torch.float16), zero.to(torch.float16)
return qweights.view(torch.uint8), scale.to(torch.float16), zero.to(torch.float16)


@module_optimization
@@ -85,7 +85,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
# auto-round's QuantLinear
qweights, scale, zero = unpack_auto_round_layer(layer, qtype=qtype)
return QuantizedLinear(qweights, scale, zero, layer.bias,
group_size=group_size, qtype=qtype)
group_size=group_size, qtype=qtype)
elif isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
enable_scale_search = (os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" or
os.environ.get("IPEX_LLM_NPU_QUANTIZATION_HQQ", "0") != "0")
@@ -101,9 +101,10 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
group_size=group_size, qtype=qtype)


def convert_auto_round_model_to_npu_model(model, save_directory, max_context_len = 1024, max_prompt_len = 960,
transpose_value_cache = True, fuse_layers = None, mixed_precision = False,
inter_pp = None, intra_pp = None, optimize_model=True):
def convert_auto_round_model_to_npu_model(model, save_directory, max_context_len=1024,
max_prompt_len=960, transpose_value_cache=True,
fuse_layers=None, mixed_precision=False,
inter_pp=None, intra_pp=None, optimize_model=True):
quant_config = getattr(model.config, "quantization_config", None)
if quant_config is None and quant_config.quant_method != "intel/auto-round":
exit(-1)
Expand All @@ -112,16 +113,16 @@ def convert_auto_round_model_to_npu_model(model, save_directory, max_context_len
group_size = quant_config.group_size
sym = quant_config.sym

if sym and bits == 4 :
if sym and bits == 4:
qtype = "sym_int4_rtn"
elif not sym and bits == 4:
qtype = "asym_int4_rtn"
elif sym and bits == 4:
elif sym and bits == 4:
qtype = "sym_int8_rtn"
else:
invalidInputError(False,
"Invalid dtype.")
"Invalid dtype.")

if group_size == -1:
quantization_group_size = 0
else:
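The hunks above are the core of the auto-round unpacking: each int32 in qweight holds 32 // bits packed values, the inner loop shifts the j-th nibble down and masks it with 0x0F, and the symmetric path then subtracts 8 to move the range from [0, 15] to [-8, 7] before transposing to (m, n). Below is a minimal, self-contained sketch of that scheme; the allocation of int_weight, the value of num, and the tail of the nibble-pair repack are cut off in the hunks, so the function names and the final OR-combine used here are assumptions for illustration, not the library's actual code.

import torch

def unpack_int4_sketch(qweight: torch.Tensor, bits: int = 4) -> torch.Tensor:
    # qweight: (n // 8, m) int32, eight 4-bit values packed per element (assumed layout)
    per_word = 32 // bits                      # plays the role of `num` in the diff
    n = qweight.shape[0] * per_word
    m = qweight.shape[1]
    int_weight = torch.empty(n, m, dtype=torch.uint8)
    for i in range(qweight.shape[0]):
        for j in range(per_word):
            # shift the j-th nibble down and mask it out, as in the diff
            int_weight[i * per_word + j, :] = ((qweight[i, :] >> (j * bits)) & 0x0000000F).to(torch.uint8)
    # symmetric int4: stored values are offset by 8, so map [0, 15] -> [-8, 7]
    signed = (int_weight.to(torch.int16) - 8).to(torch.int8)
    return signed.t().contiguous()             # (m, n)

def repack_two_per_byte(signed: torch.Tensor) -> torch.Tensor:
    # Hypothetical tail of the reshape(m, -1, 2) / split(1, dim=-1) step:
    # put each low/high nibble pair back into one uint8 for the NPU layout.
    pairs = signed.reshape(signed.shape[0], -1, 2)
    low, high = pairs.split(1, dim=-1)
    low = low.squeeze(-1).contiguous().view(torch.uint8) & 0x0F
    high = (high.squeeze(-1).contiguous().view(torch.uint8) & 0x0F) << 4
    return high | low                          # (m, n // 2)

For the asymmetric branch the same shift-and-mask runs over qzeros, and the zero point is folded into an additive bias: assuming the usual scale * (q - zp) dequantization, scale * (q - zp) = scale * (q - 8) + (8 - zp) * scale, so computing zero = -zp * scale + 8 * scale (as the hunk does) lets asym_int4 reuse the symmetric kernel plus a per-output offset.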
@@ -207,7 +207,7 @@ def convert_llm(model: torch.nn.Module,
n_splits_down_proj = 1
else:
if is_auto_round_model(model):
n_splits_down_proj = 1 # for auto-round
n_splits_down_proj = 1 # for auto-round
else:
n_splits_down_proj = 2 if (model.config.intermediate_size == 18944 or
os.environ.get("IPEX_LLM_NPU_MTL", "0") == "1" or
@@ -308,7 +308,8 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down
weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0),
torch.stack(zeros, axis=0)))
else:
weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0)))
weights.append((torch.stack(l_weights, axis=0),
torch.stack(scales, axis=0)))
else:
for layer in [attn_layer.q_proj, attn_layer.k_proj,
attn_layer.v_proj, attn_layer.o_proj,
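The last hunk only re-wraps a long weights.append(...) call for line length, but the call itself is worth seeing: the per-split quantized weights and scales of one projection are stacked along a new leading axis so the fused layer can address split k by indexing dimension 0. A tiny sketch with made-up shapes (the real split counts, dtypes, and group sizes come from the surrounding converter and are assumptions here):

import torch

n_splits, out_f, in_f_per_split = 2, 8, 16
l_weights = [torch.randint(0, 256, (out_f, in_f_per_split), dtype=torch.uint8)
             for _ in range(n_splits)]
scales = [torch.rand(out_f, dtype=torch.float16) for _ in range(n_splits)]

# torch.stack adds the leading "split" axis:
# weights -> (n_splits, out_f, in_f_per_split), scales -> (n_splits, out_f)
packed = (torch.stack(l_weights, dim=0), torch.stack(scales, dim=0))
print(packed[0].shape, packed[1].shape)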
