
Commit

add minicpm-v models to transformers_int4_npu_win api (#12352)
* add minicpm npu

* optimize model
JinheTang authored Nov 7, 2024
1 parent a7b6668 commit 79f2877
Showing 1 changed file with 5 additions and 10 deletions.
python/llm/dev/benchmark/all-in-one/run.py: 15 changes (5 additions, 10 deletions)
@@ -629,18 +629,13 @@ def transformers_int4_npu_win(repo_id,
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
-    if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                          optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
+    if repo_id in MINICPM_V_IDS:
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, optimize_model=optimize_model,
+                                          trust_remote_code=True, use_cache=True, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
                                           quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                          torch_dtype=torch.float16, attn_implementation="eager").eval()
+                                          attn_implementation="eager", modules_to_not_convert=["vpm", "resampler"]).eval()
+        model = model.llm
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    elif repo_id in LLAMA_IDS:
-        model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
-                                                     optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
-                                                     quantization_group_size=npu_group_size, transpose_value_cache=transpose_value_cache,
-                                                     use_cache=True, attn_implementation="eager").eval()
-        tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype=torch.float16,
                                                      optimize_model=optimize_model, max_context_len=max_context_len, max_prompt_len=int(in_out_len[0]),
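For readers who want to try the new branch outside the benchmark harness, below is a minimal sketch of the MiniCPM-V NPU load path this commit adds. It assumes the NPU AutoModel that run.py uses for this API comes from ipex_llm.transformers.npu_model; the repo id, low-bit format, and numeric values are illustrative placeholders, not the benchmark's actual configuration.

# Minimal sketch (not from the commit): loading a MiniCPM-V model the way the
# new MINICPM_V_IDS branch does.
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModel  # assumed NPU import used by run.py

model_path = "openbmb/MiniCPM-V-2_6"   # assumed to be one of the MINICPM_V_IDS entries

model = AutoModel.from_pretrained(
    model_path,
    load_in_low_bit="sym_int4",        # 4-bit weights, standing in for the benchmark's low_bit
    optimize_model=True,
    trust_remote_code=True,
    use_cache=True,
    max_context_len=1024,              # placeholder for max_context_len
    max_prompt_len=512,                # placeholder for int(in_out_len[0])
    quantization_group_size=0,         # placeholder for npu_group_size
    transpose_value_cache=True,
    attn_implementation="eager",
    modules_to_not_convert=["vpm", "resampler"],  # leave the vision tower and resampler unquantized
).eval()

# The benchmark drives only the language model, so it replaces the multimodal
# wrapper with its .llm submodule before generation.
model = model.llm
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)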
