[NPU pipeline] Support save & load and update examples #12293

Merged · 3 commits · Oct 30, 2024
---
Example README
@@ -51,9 +51,12 @@ python baichuan2.py

Arguments info:
- `--repo-id-or-model-path REPO_ID_OR_MODEL_PATH`: argument defining the huggingface repo id for the model (e.g. `meta-llama/Llama-2-7b-chat-hf`) to be downloaded, or the path to the huggingface checkpoint folder.
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path for saving/loading the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` is loaded. If it is an existing path, the lowbit model found there is loaded. If it is a non-existing path, the original pretrained model is loaded and the converted lowbit version is saved into `LOWBIT_MODEL_PATH` (see the usage sketch after this list). Defaults to `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred. Defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. Defaults to `32`.
- `--max-context-len MAX_CONTEXT_LEN` (renamed from `--max-output-len`): defines the maximum sequence length for both input and output tokens. Defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: defines the maximum number of tokens that the input prompt can contain. Defaults to `512`.
- `--disable-transpose-value-cache`: disable the optimization of transposing value cache.
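For instance, the first command below converts the model and saves the lowbit copy, while a later identical run finds the existing path and loads it directly (the repo id and save path here are illustrative):

```bash
# First run: convert the pretrained model and save the lowbit copy
python baichuan2.py --repo-id-or-model-path baichuan-inc/Baichuan2-7B-Chat --lowbit-path ./baichuan2-lowbit

# Later runs: the existing lowbit copy is loaded, skipping conversion
python baichuan2.py --repo-id-or-model-path baichuan-inc/Baichuan2-7B-Chat --lowbit-path ./baichuan2-lowbit
```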

### Sample Output
#### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
---
Baichuan2 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,28 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Baichuan2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # Previously the model was always converted via from_pretrained; now an
    # existing --lowbit-path short-circuits conversion and loads the saved copy.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     torch_dtype=torch.float16,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache,
                                                     trust_remote_code=True)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
            trust_remote_code=True
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

---
Llama2 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -48,29 +49,49 @@ def get_prompt(message: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama2 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # An existing --lowbit-path again short-circuits conversion and loads the saved copy.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     quantization_group_size=args.quantization_group_size,
                                                     torch_dtype=torch.float16,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

DEFAULT_SYSTEM_PROMPT = """\
"""

---
Llama3 example script
@@ -15,6 +15,7 @@
#


import os
import torch
import time
import argparse
@@ -54,29 +55,49 @@ def get_prompt(user_input: str, chat_history: list[tuple[str, str]],
help="The huggingface repo id for the Llama3 model to be downloaded"
", or the path to the huggingface checkpoint folder",
)
parser.add_argument("--lowbit-path", type=str,
default="",
help="The path to the lowbit model folder, leave blank if you do not want to save. \
If path not exists, lowbit model will be saved there. \
Else, lowbit model will be loaded.",
)
parser.add_argument('--prompt', type=str, default="What is AI?",
help='Prompt to infer')
parser.add_argument("--n-predict", type=int, default=32, help="Max tokens to predict")
parser.add_argument("--max-context-len", type=int, default=1024)
parser.add_argument("--max-prompt-len", type=int, default=960)
parser.add_argument("--max-prompt-len", type=int, default=512)
parser.add_argument("--quantization_group_size", type=int, default=0)
parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)

args = parser.parse_args()
model_path = args.repo_id_or_model_path

    # Same pattern as the other examples: convert on first run, load afterwards.
    if not args.lowbit_path or not os.path.exists(args.lowbit_path):
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16,
                                                     optimize_model=True,
                                                     pipeline=True,
                                                     max_context_len=args.max_context_len,
                                                     max_prompt_len=args.max_prompt_len,
                                                     quantization_group_size=args.quantization_group_size,
                                                     attn_implementation="eager",
                                                     transpose_value_cache=not args.disable_transpose_value_cache)
    else:
        model = AutoModelForCausalLM.load_low_bit(
            args.lowbit_path,
            attn_implementation="eager",
            torch_dtype=torch.float16,
            max_context_len=args.max_context_len,
            max_prompt_len=args.max_prompt_len,
            pipeline=True,
            transpose_value_cache=not args.disable_transpose_value_cache,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Save the converted lowbit model on the first run so later runs can load it
    if args.lowbit_path and not os.path.exists(args.lowbit_path):
        model.save_low_bit(args.lowbit_path)

print("-" * 80)
print("done")
with torch.inference_mode():
---
Another example README
@@ -127,7 +127,7 @@ Arguments info:
- `--lowbit-path LOWBIT_MODEL_PATH`: argument defining the path for saving/loading the lowbit version of the model. If it is an empty string, the original pretrained model specified by `REPO_ID_OR_MODEL_PATH` is loaded. If it is an existing path, the lowbit model found there is loaded. If it is a non-existing path, the original pretrained model is loaded and the converted lowbit version is saved into `LOWBIT_MODEL_PATH` (a condensed sketch of this flow follows the list). Defaults to `''`, i.e. an empty string.
- `--prompt PROMPT`: argument defining the prompt to be inferred (with integrated prompt format for chat). Defaults to `What is AI?`.
- `--n-predict N_PREDICT`: argument defining the max number of tokens to predict. Defaults to `32`.
- `--max-context-len MAX_CONTEXT_LEN` (renamed from `--max-output-len`): defines the maximum sequence length for both input and output tokens. Defaults to `1024`.
- `--max-prompt-len MAX_PROMPT_LEN`: defines the maximum number of tokens that the input prompt can contain. Defaults to `512`.
- `--disable-transpose-value-cache`: disable the optimization of transposing value cache.
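The save/load flow these flags drive is the same in every updated example. A condensed, hedged sketch (the import path mirrors the `npu_model.py` file changed below; the repo id, path, and lengths are illustrative):

```python
import os
import torch
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

lowbit_path = "./llama2-lowbit"              # illustrative save location
model_path = "meta-llama/Llama-2-7b-chat-hf"

if not os.path.exists(lowbit_path):
    # First run: convert the pretrained checkpoint for the NPU pipeline ...
    model = AutoModelForCausalLM.from_pretrained(model_path,
                                                 optimize_model=True,
                                                 pipeline=True,
                                                 max_context_len=1024,
                                                 max_prompt_len=512,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="eager")
    # ... and persist the converted weights for later runs
    model.save_low_bit(lowbit_path)
else:
    # Later runs: load the converted model directly, skipping conversion
    model = AutoModelForCausalLM.load_low_bit(lowbit_path,
                                              pipeline=True,
                                              max_context_len=1024,
                                              max_prompt_len=512,
                                              torch_dtype=torch.float16,
                                              attn_implementation="eager")
```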

---
python/llm/src/ipex_llm/transformers/npu_model.py — 32 changes: 22 additions & 10 deletions
@@ -166,6 +166,8 @@ def from_pretrained(cls, *args, **kwargs):

        logger.info(f"Converting model, it may take up to several minutes ...")

        # Record optimize_model in the config so load_low_bit can recover it later
        model.config.update({"optimize_model": optimize_model})

        if mock_device == "cpu":
            with torch.no_grad():
                # Only mock quantization_group_size=0 for now
@@ -262,7 +264,6 @@ def optimize_npu_model(cls, *args, **kwargs):
                transpose_value_cache=transpose_value_cache,
                group_size=quantization_group_size
            )
        else:
            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                import convert_llm
@@ -271,7 +272,7 @@
                        max_prompt_len=max_prompt_len,
                        transpose_value_cache=transpose_value_cache,
                        group_size=quantization_group_size)
        # Moved out of the non-pipeline branch: save_low_bit is now attached in
        # both branches, so pipeline models can be saved as well.
        model.save_low_bit = types.MethodType(save_low_bit, model)
        return model

    @classmethod
@@ -304,8 +305,10 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        ignore_argument(kwargs, "pipeline_parallel_stages")
        ignore_argument(kwargs, "mixed_precision")
        ignore_argument(kwargs, "quantization_group_size")
        # optimize_model now comes from the saved config (popped below) rather
        # than kwargs, and max_output_len is superseded by max_context_len
        ignore_argument(kwargs, "optimize_model")
        pipeline = kwargs.pop("pipeline", False)
        max_context_len = kwargs.pop("max_context_len", 1024)
        max_context_len = max_context_len - 1
        max_prompt_len = kwargs.pop("max_prompt_len", 512)
        inter_pp = kwargs.pop("inter_pp", None)
        intra_pp = kwargs.pop("intra_pp", None)
@@ -355,6 +358,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        bigdl_lcmu_enabled = config_dict.pop("bigdl_lcmu_enabled", True)
        mixed_precision = config_dict.pop("mixed_precision", False)
        quantization_group_size = config_dict.pop("group_size", 0)
        optimize_model = config_dict.pop("optimize_model", False)  # stamped into the config by from_pretrained

        invalidInputError(
            qtype,
@@ -450,13 +454,12 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        quant_device = "meta" if bigdl_lcmu_enabled else "cpu"
        logger.info(f"Converting model, it may take up to several minutes ...")
        from intel_npu_acceleration_library.compiler import create_npu_kernels
        if optimize_model:
            invalidInputError(
                max_prompt_len < max_context_len,
                (
                    f"max_prompt_len ({max_prompt_len}) should be less"
                    f" than max_context_len ({max_context_len})"
                ),
            )
            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre
@@ -468,7 +471,8 @@

            with torch.no_grad():
                optimize_llm_pre(model, qtype, mixed_precision,
                                 quantization_group_size=quantization_group_size,
                                 load=bigdl_lcmu_enabled)
                cls.load_convert(qtype, model, quant_device, modules_to_not_convert,
                                 quantization_group_size, *model_args, **kwargs)
                create_npu_kernels(llm)
@@ -541,17 +545,25 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
        for param in model.parameters():
            param.requires_grad_(False)

        if optimize_model and not pipeline:
            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
            optimize_llm(
                llm,
                max_output_len=max_context_len,
                max_prompt_len=max_prompt_len,
                inter_pp=inter_pp,
                intra_pp=intra_pp,
                transpose_value_cache=transpose_value_cache,
                group_size=quantization_group_size
            )
        elif optimize_model and pipeline:
            # New branch: pipeline models reuse the same conversion path as from_pretrained
            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                import convert_llm
            convert_llm(llm,
                        kv_len=max_context_len,
                        max_prompt_len=max_prompt_len,
                        transpose_value_cache=transpose_value_cache,
                        group_size=quantization_group_size)

        return model
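Taken together: `from_pretrained` stamps `optimize_model` into the model config, `save_low_bit` persists it with the checkpoint, and `load_low_bit` pops it back and, for pipeline models, reruns `convert_llm`. A hypothetical way to confirm the flag survives the round trip (the path is illustrative and assumes the saved folder carries a standard config.json):

```python
from transformers import AutoConfig

# Inspect a checkpoint produced by save_low_bit(); "./baichuan2-lowbit" is illustrative.
cfg = AutoConfig.from_pretrained("./baichuan2-lowbit", trust_remote_code=True)
print(getattr(cfg, "optimize_model", False))  # True for checkpoints saved after this change
```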
