forked from SNU-ARC/any-precision-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo.py
114 lines (93 loc) · 4.06 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import torch
from any_precision import AnyPrecisionForCausalLM
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import logging
import time
from argparse import ArgumentParser
# Log lines carry the wall-clock time (no date), the level name, and the message.
logging.basicConfig(
    format='[%(asctime)s | %(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Command line: an optional list of integer bit-widths to benchmark.
parser = ArgumentParser()
parser.add_argument(
    '-p', '--precisions',
    type=int,
    nargs='+',
    default=None,
    help="The precisions to benchmark. If not specified, all available precisions will be benchmarked.",
)
args = parser.parse_args()
if __name__ == '__main__':
model_path = './cache/packed/anyprec-(Llama-2-7b-chat-hf)-w8_orig3-gc1-c4_s100_blk512'
original_model_path = 'meta-llama/Llama-2-7b-chat-hf'
# Configure the precisions to benchmark
do_fp16 = True
if args.precisions is not None:
precisions = args.precisions
if 16 in precisions:
precisions.remove(16)
else:
do_fp16 = False
else:
precisions = None # Benchmark all available precisions
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
streamer = TextStreamer(tokenizer)
model = AnyPrecisionForCausalLM.from_quantized(model_path, precisions=precisions)
model = model.eval().cuda()
# Warm up CUDA cache for stable performance
print("~~~~~~~ Warming up CUDA cache ~~~~~~~")
input_context = "A CUDA cache warm-up is needed to"
input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
output = model.generate(
input_ids,
precision=min(model.precisions),
max_new_tokens=32,
pad_token_id=tokenizer.eos_token_id,
streamer=streamer,
)
print("~~~~~~~ Warm up complete ~~~~~~~\n")
# Now begin bit-width benchmarking
input_context = input("Prompt/Context: ")
input_ids = tokenizer.encode(input_context, return_tensors="pt").cuda()
results = {}
for precision in model.precisions:
print(f"=============== generation with {precision}-bit precision ===============")
torch.cuda.synchronize()
start_time = time.time()
output = model.generate(
input_ids,
precision=precision,
max_new_tokens=256,
pad_token_id=tokenizer.eos_token_id,
streamer=streamer,
)
torch.cuda.synchronize()
end_time = time.time()
# Calculate generation speed
token_count = len(output[0]) - len(input_ids[0])
tokens_per_second = token_count / (end_time - start_time)
ms_per_token = 1 / tokens_per_second * 1000
results[precision] = (tokens_per_second, ms_per_token)
print(f"\n( Generation speed: {tokens_per_second:.1f} tok/s | Latency: {ms_per_token:.2f} ms/tok )\n")
# Clear memory
del model
torch.cuda.empty_cache()
if do_fp16:
# Benchmark the original model
print(f"=============== generation with fp16 precision ===============")
model = AutoModelForCausalLM.from_pretrained(original_model_path, torch_dtype=torch.float16).eval().cuda()
torch.cuda.synchronize()
start_time = time.time()
output = model.generate(
input_ids,
max_length=256,
pad_token_id=tokenizer.eos_token_id,
streamer=streamer,
)
torch.cuda.synchronize()
end_time = time.time()
# Calculate generation speed
token_count = len(output[0]) - len(input_ids[0])
tokens_per_second = token_count / (end_time - start_time)
ms_per_token = 1 / tokens_per_second * 1000
results[16] = (tokens_per_second, ms_per_token)
print(f"\n( Generation speed: {tokens_per_second:.1f} tok/s | Latency: {ms_per_token:.2f} ms/tok )\n")
print("=============== Summary ===============")
print(f"\nModel: {model_path}\n")
for precision, (tokens_per_second, ms_per_token) in results.items():
print(f"{precision}-bit: {tokens_per_second:.1f} tok/s | {ms_per_token:.2f} ms/tok")