[NPU pipeline] Support save & load and update examples (#12293)
* support save & load, update llama examples

* update baichuan2 example

* update readme
rnwang04 authored Oct 30, 2024
1 parent 5a15098 commit 2b2cb9c
Showing 8 changed files with 147 additions and 56 deletions.
@@ -51,9 +51,12 @@ python baichuan2.py

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It defaults to `''`, i.e. an empty string (see the sketch after this list).
- `--prompt PROMPT`: argument defining the prompt to be inferred. It defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
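
The `--lowbit-path` behaviour described above follows the same pattern as the updated example scripts in this commit: the first run converts the checkpoint and saves the low-bit folder, and later runs load that folder directly. Below is a minimal sketch of that flow; it assumes the imports used by the IPEX-LLM NPU examples (`AutoModelForCausalLM` from `ipex_llm.transformers.npu_model`, `AutoTokenizer` from `transformers`), and the model and folder paths are placeholders.

```python
import os
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"   # repo id or local checkpoint folder
lowbit_path = "./llama2-7b-npu-lowbit"         # placeholder value for --lowbit-path

if not lowbit_path or not os.path.exists(lowbit_path):
    # First run (or empty --lowbit-path): convert the original pretrained model.
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager",
                                                 transpose_value_cache=True)
else:
    # Later runs: load the previously saved low-bit model directly.
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              attn_implementation="eager",
                                              torch_dtype=torch.float16,
                                              max_context_len=1024,
                                              max_prompt_len=512,
                                              pipeline=True,
                                              transpose_value_cache=True)

# The tokenizer is still loaded from the original checkpoint folder.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Save the converted low-bit model the first time a non-existing path is given.
if lowbit_path and not os.path.exists(lowbit_path):
    model.save_low_bit(lowbit_path)
```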

### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,28 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,29 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
torch_dtype=torch.float16,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -54,29 +55,49 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
if not args.lowbit_path or not os.path.exists(args.lowbit_path):
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
optimize_model=True,
pipeline=True,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
quantization_group_size=args.quantization_group_size,
attn_implementation="eager",
transpose_value_cache=not args.disable_transpose_value_cache)
else:
model = AutoModelForCausalLM.load_low_bit(
args.lowbit_path,
attn_implementation="eager",
torch_dtype=torch.float16,
max_context_len=args.max_context_len,
max_prompt_len=args.max_prompt_len,
pipeline=True,
transpose_value_cache=not args.disable_transpose_value_cache,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

if args.lowbit_path and not os.path.exists(args.lowbit_path):
model.save_low_bit(args.lowbit_path)

print("-" * 80)
print("done")
with torch.inference_mode():
@@ -127,7 +127,7 @@ Arguments info:
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path to save/load the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded. If it is an existing path, the lowbit model in `LOWBIT_MODEL_PATH` will be loaded. If it is a non-existing path, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` will be loaded, and the converted lowbit version will be saved into `LOWBIT_MODEL_PATH`. It defaults to `''`, i.e. an empty string (see the sketch after this list).
- `--prompt PROMPT`: argument defining the prompt to be inferred (with the integrated prompt format for chat). It defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. It defaults to `32`.
- `--max-output-len MAX_OUTPUT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-context-len MAX_CONTEXT_LEN`: Defines the maximum sequence length for both input and output tokens. It defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: Defines the maximum number of tokens that the input prompt can contain. It defaults to `512`.
- `--disable-transpose-value-cache`: Disable the optimization of transposing value cache.
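
Once a low-bit folder has been saved, a later run only needs `load_low_bit` plus the NPU pipeline arguments listed above. A short sketch with placeholder paths, under the same import assumptions as the earlier example:

```python
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

lowbit_path = "./llama2-7b-npu-lowbit"          # placeholder: an existing --lowbit-path folder
model_path = "meta-llama/Llama-2-7b-chat-hf"    # still used for the tokenizer

model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                          attn_implementation="eager",
                                          torch_dtype=torch.float16,
                                          max_context_len=1024,  # max length of input + output tokens
                                          max_prompt_len=512,    # max length of the input prompt
                                          pipeline=True,
                                          transpose_value_cache=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```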

32 changes: 22 additions & 10 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -166,6 +166,8 @@ def from_pretrained(cls, *args, **kwargs):

logger.info(f"Converting model, it may takes up to several minutes ...")

model.config.update({"optimize_model": optimize_model})

if mock_device == "cpu":
with torch.no_grad():
# Only mock quantization_group_size=0 for now
@@ -262,7 +264,6 @@ def optimize_npu_model(cls, *args, **kwargs):
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
model.save_low_bit = types.MethodType(save_low_bit, model)
else:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
@@ -271,7 +272,7 @@ def optimize_npu_model(cls, *args, **kwargs):
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size)

model.save_low_bit = types.MethodType(save_low_bit, model)
return model

@classmethod
@@ -304,8 +305,10 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
ignore_argument(kwargs, "pipeline_parallel_stages")
ignore_argument(kwargs, "mixed_precision")
ignore_argument(kwargs, "quantization_group_size")
optimize_model = kwargs.pop("optimize_model", False)
max_output_len = kwargs.pop("max_output_len", 1024)
ignore_argument(kwargs, "optimize_model")
pipeline = kwargs.pop("pipeline", False)
max_context_len = kwargs.pop("max_context_len", 1024)
max_context_len = max_context_len - 1
max_prompt_len = kwargs.pop("max_prompt_len", 512)
inter_pp = kwargs.pop("inter_pp", None)
intra_pp = kwargs.pop("intra_pp", None)
@@ -355,6 +358,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
mixed_precision = config_dict.pop("mixed_precision", False)
quantization_group_size = config_dict.pop("group_size", 0)
optimize_model = config_dict.pop("optimize_model", False)

invalidInputError(
qtype,
@@ -450,13 +454,12 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
logger.info(f"Converting model, it may takes up to several minutes ...")
from intel_npu_acceleration_library.compiler import create_npu_kernels

if optimize_model:
invalidInputError(
max_prompt_len < max_output_len,
max_prompt_len < max_context_len,
(
f"max_prompt_len ({max_prompt_len}) should be less"
" than max_output_len ({max_output_len})"
" than max_context_len ({max_context_len})"
),
)
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
@@ -468,7 +471,8 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)

with torch.no_grad():
optimize_llm_pre(model, qtype, mixed_precision,
quantization_group_size=quantization_group_size)
quantization_group_size=quantization_group_size,
load=bigdl_lcmu_enabled)
cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
quantization_group_size, *model_args, **kwargs)
create_npu_kernels(llm)
@@ -541,17 +545,25 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
for param in model.parameters():
param.requires_grad_(False)

if optimize_model:
if optimize_model and not pipeline:
from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
optimize_llm(
llm,
max_output_len=max_output_len,
max_output_len=max_context_len,
max_prompt_len=max_prompt_len,
inter_pp=inter_pp,
intra_pp=intra_pp,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size
)
elif optimize_model and pipeline:
from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
import convert_llm
convert_llm(llm,
kv_len=max_context_len,
max_prompt_len=max_prompt_len,
transpose_value_cache=transpose_value_cache,
group_size=quantization_group_size)

return model
