[NPU] Support save npu quantized model without npu dependency (#12647)
* support save awq

* load quantized model & save npu compiled model

* fix style

* update

* fix dll load issue

* update error message

* fix style
cyita authored Jan 6, 2025
1 parent 502461d commit fae73ee
Showing 5 changed files with 203 additions and 144 deletions.
130 changes: 80 additions & 50 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -27,7 +27,7 @@

from ipex_llm.utils.common.log4Error import invalidInputError
from ipex_llm.transformers.utils import logger, load_imatrix_data
from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post
from ipex_llm.transformers.npu_models.convert import optimize_llm


def patch_flash_attn_import(filename: str) -> List[str]:
@@ -207,8 +207,6 @@ def from_pretrained(cls, *args, **kwargs):
model = model.eval()
logger.info(f"Finish to convert model")
else:
from intel_npu_acceleration_library.compiler import create_npu_kernels

if optimize_model:
invalidInputError(
max_prompt_len < max_context_len,
@@ -232,11 +230,14 @@ def from_pretrained(cls, *args, **kwargs):
"convert_model": convert_model,
"save_directory": save_directory,
"fuse_layers": fuse_layers,
"imatrix_data": imatrix_data
"imatrix_data": imatrix_data,
"skip_npu_logic": mock_device == "dummy",
}
# Dummy will skip npu related logic and save the quantized model
if mock_device == "dummy":
model.save_low_bit = types.MethodType(save_low_bit, model)
model = cls.optimize_npu_model(*args, **optimize_kwargs)
else:
from ipex_llm.transformers.npu_models.convert import optimize_llm
optimize_llm(model)
with torch.no_grad():
cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
@@ -258,7 +259,6 @@ def from_pretrained(cls, *args, **kwargs):
def optimize_npu_model(cls, *args, **kwargs):

from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre, optimize_llm
from intel_npu_acceleration_library.compiler import create_npu_kernels

model = kwargs.pop("model")
qtype = kwargs.pop("qtype", "sym_int4_rtn")
@@ -275,6 +275,7 @@ def optimize_npu_model(cls, *args, **kwargs):
save_directory = kwargs.pop('save_directory', None)
fuse_layers = kwargs.pop('fuse_layers', None)
imatrix_data = kwargs.pop('imatrix_data', None)
skip_npu_logic = kwargs.pop("skip_npu_logic", False)
invalidInputError(save_directory is not None,
"Please provide the path to save converted model "
"through `save_directory`.")
@@ -294,51 +295,58 @@ def optimize_npu_model(cls, *args, **kwargs):
cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
quantization_group_size, imatrix_data,
*args, **kwargs)
create_npu_kernels(llm)
if not skip_npu_logic:
from intel_npu_acceleration_library.compiler import create_npu_kernels
create_npu_kernels(llm)
model = model.eval()
logger.info(f"Finish to convert model")
model.config.update({"bigdl_transformers_low_bit": qtype})
model.share_memory()

if not pipeline:
if model.config.model_type in ["qwen2", "llama", "minicpm"]:
from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
optimize_llm_single_process(
llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size,
qtype=qtype,
save_directory=save_directory,
fuse_layers=fuse_layers,
has_llm=hasattr(model, "llm")
)
else:
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
if skip_npu_logic:
model.save_low_bit(model_dir=save_directory)
else:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
model.share_memory()

if not pipeline:
if model.config.model_type in ["qwen2", "llama", "minicpm"]:
from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
optimize_llm_single_process(
llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size,
qtype=qtype,
convert_model=convert_model,
save_directory=save_directory,
fuse_layers=fuse_layers)
model.save_low_bit = types.MethodType(save_low_bit, model)
model.save_low_bit(save_directory)
logger.info(f"Converted model has already saved to {save_directory}.")
fuse_layers=fuse_layers,
has_llm=hasattr(model, "llm")
)
else:
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
else:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size,
qtype=qtype,
convert_model=convert_model,
save_directory=save_directory,
fuse_layers=fuse_layers)
model.save_low_bit = types.MethodType(save_low_bit, model)
model.save_low_bit(save_directory)
logger.info(f"Converted model has already saved to {save_directory}.")

return model

@classmethod
@@ -379,6 +387,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
intra_pp = kwargs.pop("intra_pp", None)
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
save_directory = kwargs.pop('save_directory', None)

from transformers.models.auto.configuration_auto import AutoConfig
from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -650,16 +659,37 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
param.requires_grad_(False)

if optimize_model and not pipeline:
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
if model.config.model_type in ["qwen2", "llama", "minicpm"]:
from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
if save_directory is None:
invalidInputError(False,
"Please specify the save_directory, the path of folder " +
"to save the compiled NPU model. If path not exists, " +
"the compiled NPU model will be saved there. " +
"Else, program will exit.")

optimize_llm_single_process(
llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size,
qtype=qtype,
save_directory=save_directory,
fuse_layers=None,
has_llm=hasattr(model, "llm")
)
else:
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
optimize_llm(
llm,
max_context_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
elif optimize_model and pipeline:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
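Taken together, the npu_model.py changes split the workflow in two: a CPU-only machine (the "dummy" mock device) can quantize the weights and call save_low_bit without ever importing intel_npu_acceleration_library, and an NPU machine can later load_low_bit the result and compile the NPU blobs into save_directory. A rough usage sketch under those assumptions — the model id, the low-bit format, and the exact keyword that selects the dummy device are illustrative, since the kwarg plumbing sits outside the hunks shown here:

```python
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# Stage 1 -- machine WITHOUT intel_npu_acceleration_library:
# quantize on CPU and save the low-bit checkpoint. `mock_device="dummy"` is an
# assumed user-facing switch behind the `mock_device == "dummy"` check above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",          # hypothetical model id
    load_in_low_bit="sym_int4_rtn",           # qtype default seen in optimize_npu_model
    optimize_model=True,
    max_context_len=1024,
    max_prompt_len=512,
    mock_device="dummy",                      # skip NPU logic, just quantize + save
    save_directory="./llama2-7b-sym-int4",
)

# Stage 2 -- machine WITH the NPU driver and intel_npu_acceleration_library:
# reload the quantized weights and compile per-layer NPU blobs. The new check in
# load_low_bit makes `save_directory` mandatory for the single-process path.
model = AutoModelForCausalLM.load_low_bit(
    "./llama2-7b-sym-int4",
    max_context_len=1024,
    max_prompt_len=512,
    save_directory="./llama2-7b-npu-compiled",
)
```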
@@ -18,7 +18,7 @@
import importlib
import numpy as np
from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params
from ipex_llm.transformers.npu_models.lm_head import LMHeadLinear, SlicedLMHead
from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
from ipex_llm.utils.common.log4Error import invalidInputError


18 changes: 15 additions & 3 deletions python/llm/src/ipex_llm/transformers/npu_models/linear.py
@@ -21,16 +21,25 @@
# SPDX-License-Identifier: Apache 2.0
#

from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
from intel_npu_acceleration_library.dtypes import NPUDtype

import os
import torch
from torch.nn import Parameter
import uuid
import math
from intel_npu_acceleration_library.backend import run_matmul
from typing import Optional, Union
from ipex_llm.utils.common import invalidInputError
import importlib


def is_acclib_available():
return importlib.util.find_spec("intel_npu_acceleration_library") is not None


if is_acclib_available():
from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
from intel_npu_acceleration_library.dtypes import NPUDtype
from intel_npu_acceleration_library.backend import run_matmul


class Linear(torch.nn.Module):
@@ -63,6 +72,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.training:
out = self._mm(x, self.weight, None)
else:
from intel_npu_acceleration_library.backend import run_matmul
out = run_matmul(x, self.weight, None, self.op_id)

if self.bias is None:
@@ -105,6 +115,8 @@ def fromTensor(
Returns:
Union[Linear, QuantizedLinear]: A NPU linear layer
"""
from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
from intel_npu_acceleration_library.dtypes import NPUDtype
if dtype.is_floating_point:
if bias is None:
return Linear(weight.to(dtype), None)
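The linear.py hunks establish the pattern reused across the PR: probe for intel_npu_acceleration_library with importlib, perform the module-level imports only when the package is resolvable, and repeat the import locally inside the code paths that actually run on the NPU. A generic sketch of that guard follows; the helper wrapping run_matmul is a hypothetical illustration, not code from the PR:

```python
import importlib.util


def is_acclib_available() -> bool:
    # True only if intel_npu_acceleration_library can be resolved on this machine.
    return importlib.util.find_spec("intel_npu_acceleration_library") is not None


# Module-level import only when the NPU stack exists, so importing this file on a
# CPU-only box (e.g. just to quantize and save) no longer raises ImportError.
if is_acclib_available():
    from intel_npu_acceleration_library.backend import run_matmul


def npu_matmul_or_fail(x, weight, op_id):
    # Hypothetical helper: defer the import again at call time so the failure only
    # surfaces if an NPU code path is actually exercised without the library.
    if not is_acclib_available():
        raise RuntimeError("intel_npu_acceleration_library is required for NPU execution")
    from intel_npu_acceleration_library.backend import run_matmul
    return run_matmul(x, weight, None, op_id)
```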
91 changes: 1 addition & 90 deletions python/llm/src/ipex_llm/transformers/npu_models/lm_head.py
@@ -16,96 +16,6 @@
import torch
from torch import nn
import numpy as np
from filelock import FileLock
from intel_npu_acceleration_library.backend import NNFactory
from intel_npu_acceleration_library.backend.bindings import lib as backend_lib


class LMHeadLinear(NNFactory):
"""Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
with weights prefetching."""

def __init__(
self,
inC: int,
outC: int,
batch: int,
split_num: int = 2,
profile: bool = False,
device: str = "NPU",
dtype: np.dtype = np.int8,
use_split: bool = False,
group_size: int = 0,
asym: bool = False,
):
"""Initialize the LMHeadLinear class.
Args:
inC (int): input channels
outC (int): output channels
batch (int): batch
split_num (int): split in_features of lm_head to how many parts
profile (bool): Enable/Disable profiling. Defaults to False.
device (str): Target device, default to "NPU".
dtype (np.dtype): weights datatype. Defaults to np.int8.
"""
super().__init__(profile, device)
self.inC, self.outC = inC, outC
self.batch = batch

self.split_num = split_num
if use_split:
input = self.parameter((1, self.batch, self.inC))
res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
scale_factor=(group_size == 0), asym=asym)
else:
input = self.parameter((self.batch, self.inC))
split_size = self.inC // split_num // 2 * 2

for i in range(self.split_num):
start_idx = i * split_size
end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
input_slice = self.slice(input, begin=[0, start_idx],
end=[self.batch, end_idx])
linear_slice = self.linear(input_slice, outC, split_size, bias=False,
wt_dtype=dtype, asym=asym)
if i == 0:
res = linear_slice
else:
res += linear_slice

print("start compiling lm_head")
self.compile()
print("end compiling lm_head")

def set_weights(self, op_id, weights):
self.set_weights_async(op_id, weights)
with FileLock(f"lmhead_run.lock"):
backend_lib.run(self._mm)

def set_weights_async(self, op_id, weights):
self.setWeights(1, op_id, *weights)

def run(
self, X: np.ndarray
) -> np.ndarray:
"""Run the layer: $X * (W * S)^T$ .
Args:
X (np.ndarray): activation
Raises:
RuntimeError: Input, weights or scale shape mismatch
Returns:
np.ndarray: result
"""
self.set_input_tensor(X, 0)
self.elapsed = backend_lib.run(self._mm)
if len(self.out) == 1:
return self.out[0]
return self.out


class SlicedLMHead(nn.Module):
@@ -160,6 +70,7 @@ def get_weight_dtype(self):
return self.lm_heads[0].weight.dtype

def get_fused_lm_head(self):
from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
False, "NPU", dtype=np_dtype, use_split=self.use_split,
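lm_head.py applies the same idea at method granularity: the NPU-backed LMHeadLinear moves into its own module (imported above as ipex_llm.transformers.npu_models.lm_head_linear) and is pulled in only inside get_fused_lm_head, so importing SlicedLMHead no longer drags in the NPU backend bindings. A condensed sketch of that shape; everything except the names visible in the diff is illustrative:

```python
import torch
from torch import nn


class SlicedLMHeadSketch(nn.Module):
    """Illustrative stand-in for SlicedLMHead: no NPU imports at module scope."""

    def __init__(self, weight: torch.Tensor, split_num: int = 2, use_split: bool = False):
        super().__init__()
        self.inC, self.outC = weight.size(1), weight.size(0)
        self.split_num = split_num
        self.use_split = use_split
        self.fused_lm_head = None

    def get_fused_lm_head(self):
        # Deferred import: the NPU backend behind LMHeadLinear is loaded only when
        # the fused head is actually built, mirroring the diff above.
        from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
        self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
                                          False, "NPU", use_split=self.use_split)
```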