-
Notifications
You must be signed in to change notification settings - Fork 23
/
inference_4_bits.py
34 lines (29 loc) · 1.69 KB
/
inference_4_bits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import torch
from modelscope.hub.snapshot_download import snapshot_download
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
def main():
    """Run 4-bit inference with Baichuan-7B plus the Sunsimiao medical LoRA.

    Downloads the base model and the LoRA adapter from ModelScope, loads the
    base model quantized to 4 bits across available devices, applies the
    adapter, and streams a generated answer to a sample question to stdout.
    """
    cache_dir = './sunsimiao/'
    # snapshot_download returns the local directory of the downloaded
    # snapshot; use those paths directly instead of re-deriving them by
    # string concatenation (the original ignored these return values and
    # assumed ModelScope's on-disk cache layout).
    model_dir = snapshot_download('baichuan-inc/baichuan-7B',
                                  cache_dir=cache_dir,
                                  revision='v1.0.0')
    model_dir_sft = snapshot_download('thomas/Sunsimiao_lora',
                                      cache_dir=cache_dir,
                                      revision='v1.0.0')
    tokenizer = AutoTokenizer.from_pretrained(model_dir,
                                              trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_dir,
                                                 device_map="auto",
                                                 trust_remote_code=True,
                                                 low_cpu_mem_usage=True,
                                                 load_in_4bit=True,
                                                 torch_dtype=torch.float16)
    # Attach the Sunsimiao LoRA adapter on top of the 4-bit base model.
    model = PeftModel.from_pretrained(model, model_dir_sft)
    # Streams decoded tokens to stdout as they are generated.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    query = "晚上睡不着怎么办?"
    # Alpaca-style prompt template; the blank line after the preamble is part
    # of the standard template (the original omitted it, gluing the preamble
    # directly onto "### Instruction:").
    prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
    prompt += "### Instruction:\n{}\n\n### Response:\n".format(query)

    inputs = tokenizer([prompt], return_tensors="pt")
    # Move inputs to wherever device_map placed the model instead of
    # hard-coding "cuda", which crashes on CPU-only hosts.
    inputs = inputs.to(model.device)
    # inference_mode disables autograd bookkeeping during generation; the
    # return value is dropped because the streamer already prints the output.
    with torch.inference_mode():
        model.generate(**inputs, max_new_tokens=256, streamer=streamer)


if __name__ == "__main__":
    main()