"""
A much shorter version of train.py for benchmarking
"""
import os
from contextlib import nullcontext
import numpy as np
import time
import torch
import json
from optimization.model import OptimizationConfig, GPT
import tiktoken
from tqdm import tqdm
# -----------------------------------------------------------------------------
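# `result` collects the benchmark numbers produced below (validation loss, inference
# latency, training throughput) and is dumped to results.json at the end of the run.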
result = dict()
# -----------------------------------------------------------------------------
batch_size = 8
block_size = 1024
bias = False
real_data = True
seed = 1337
gradient_accumulation_steps = 40
model_type = 'gpt2' # examples: 'gpt2', 'gpt2-medium'
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
profile = False # use pytorch profiler, or just simple benchmarking?
exec(open('configurator.py').read()) # overrides from command line or config file
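# configurator.py is the nanoGPT-style override hook: it reads `--key=value` command-line
# arguments (or a config file passed as a positional argument) and overwrites the matching
# globals above, e.g.
#   python run_experiments.py --batch_size=4 --model_type=gpt2-medium --compile=True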
# -----------------------------------------------------------------------------
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
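# `ctx` wraps forward passes in torch.amp.autocast on GPU so activations and matmuls run in
# bfloat16/float16 while parameters keep their original precision; on CPU it is a no-op.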
# data loading init
if real_data:
    dataset = 'wikitext'
    data_dir = os.path.join('data', dataset)
    train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    start = 0
    def get_batch(split="train"):
        global start
        if split == "train":
            data = train_data
        elif split == "val":
            data = val_data
        else:
            raise ValueError(f"Invalid split: {split}")
        if split == "train":
            ix = torch.randint(len(data) - block_size, (batch_size,))
        elif split == "val":
            ix = torch.arange(start, start + batch_size * block_size, block_size)
            start += batch_size * block_size
        else:
            raise ValueError(f"Invalid split: {split}")
        x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
        return x, y
else:
    # alternatively, if fixed data is desired to not care about data loading
    x = torch.randint(50304, (batch_size, block_size), device=device)
    y = torch.randint(50304, (batch_size, block_size), device=device)
    get_batch = lambda split: (x, y)
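# get_batch('train') samples random block_size windows from the training split;
# get_batch('val') walks the validation split sequentially through the module-level
# `start` cursor, which is reset to 0 before the validation pass below.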
# model init
# gptconf = GPTConfig(
#     block_size = block_size, # how far back does the model look? i.e. context size
#     n_layer = 12, n_head = 12, n_embd = 768, # size of the model
#     dropout = 0, # for determinism
#     bias = bias,
# )
# gptconf = GPTConfig(
#     block_size = block_size, # how far back does the model look? i.e. context size
#     n_layer = 24, n_head = 16, n_embd = 1024, # size of the model
#     dropout = 0, # for determinism
#     bias = bias,
# )
# model = GPT(gptconf)
model = GPT.from_pretrained(model_type)
model.to(device)
optimizer = model.configure_optimizers(weight_decay=1e-2, learning_rate=1e-4, betas=(0.9, 0.95), device_type=device_type)
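# configure_optimizers follows the nanoGPT helper of the same name; in upstream nanoGPT
# it builds an AdamW optimizer (fused on CUDA when available) and applies weight decay
# only to the 2D parameter tensors (matmul weights and embeddings).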
if compile:
    print("Compiling model...")
    model = torch.compile(model) # pytorch 2.0
print("---------------------------------------------------------------------------------")
print("First task: train the model and get validation loss")
if profile:
    # useful docs on pytorch profiler:
    # - tutorial https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html
    # - api https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile
    wait, warmup, active = 5, 5, 500
    num_steps = wait + warmup + active
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./bench_log'),
        record_shapes=False,
        profile_memory=False,
        with_stack=False, # incurs an additional overhead, disable if not needed
        with_flops=True,
        with_modules=False, # only for torchscript models atm
    ) as prof:
        X, Y = get_batch('train')
        for k in range(num_steps):
            with ctx:
                logits, loss = model(X, Y)
            X, Y = get_batch('train')
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")
            prof.step() # notify the profiler at end of each step
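    # the trace written to ./bench_log can be opened in TensorBoard via the PyTorch
    # profiler plugin (e.g. `tensorboard --logdir=./bench_log`)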
else:
    # simple benchmarking
    torch.cuda.synchronize()
    for stage, num_steps in enumerate([10, 500]): # burnin, then benchmark
        t0 = time.time()
        for k in tqdm(range(num_steps)):
            optimizer.zero_grad(set_to_none=True)
            for _ in range(gradient_accumulation_steps):
                X, Y = get_batch('train')
                with ctx:
                    logits, loss = model(X, Y)
                loss.backward()
            optimizer.step()
            lossf = loss.item()
            print(f"{k}/{num_steps} loss: {lossf:.4f}")
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1-t0
        # count every micro-step from gradient accumulation when estimating MFU
        mfu = model.estimate_mfu(batch_size * gradient_accumulation_steps * num_steps, dt)
        if stage == 1:
            print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")
# validation loss
val_steps = (len(val_data)-1) // (batch_size * block_size) - 1
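# val_steps covers the validation split once in sequential batches; the final batch is
# dropped, apparently so the shifted-by-one targets never index past the end of val_data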
batch_loss = []
model.eval()
with torch.no_grad():
    start = 0
    for k in tqdm(range(val_steps)):
        X, Y = get_batch('val')
        with ctx:
            logits, loss = model(X, Y)
        lossf = loss.item()
        print(f"{k}/{val_steps} validation loss: {lossf:.4f}")
        batch_loss.append(lossf)
val_loss = np.mean(batch_loss)
print(f"Validation loss: {val_loss:.4f}")
result['loss'] = val_loss
print("------------------------------------------------------------------------------------")
print("Second task: inference latency")
enc = tiktoken.get_encoding("gpt2")
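# generate_kv is a method on this fork's GPT class (optimization.model), presumably a
# KV-cache decoding loop; its positional arguments below appear to mirror nanoGPT's
# generate(): 128 new tokens, temperature 1.0, top_k 1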
for batch in [1, 12]:
    model = GPT.from_pretrained(model_type, override_args={
        'oconfig': OptimizationConfig(inference_batch_size=batch)
    })
    model.to(device)
    model.to(ptdtype)
    prompt = torch.tensor(enc.encode("hello", allowed_special={"<|endoftext|>"}), dtype=torch.long, device=device).unsqueeze(0)
    prompt = prompt.repeat(batch, 1)
    times = []
    for stage, num_steps in enumerate([10, 10]):
        for i in tqdm(range(num_steps)):
            torch.cuda.synchronize()
            torch.manual_seed(i + seed)
            t = time.time()
            y = model.generate_kv(prompt, 128, 1.0, 1)
            if stage == 1:
                times.append(batch * 128.0 / (time.time() - t))
    print(f"inference_latency_{batch}", np.mean(times))
    result[f"inference_latency_{batch}"] = np.mean(times)
print("-----------------------------------------------------------------------------")
print("Third task: training throughput")
model.train()
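# note: `optimizer` was built for the model loaded at the top of the script, but `model`
# was rebound in the inference section above, so optimizer.step() here does not update
# the parameters of the model actually being timed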
for batch_size in [4, 12]:
    times = []
    for stage, num_steps in enumerate([2, 10]): # burnin, then benchmark
        for k in tqdm(range(num_steps)):
            torch.cuda.synchronize()
            t0 = time.time()
            optimizer.zero_grad(set_to_none=True)
            for _ in range(gradient_accumulation_steps):
                X, Y = get_batch('train')
                with ctx:
                    logits, loss = model(X, Y)
                loss.backward()
            optimizer.step()
            t1 = time.time()
            dt = t1-t0
            if stage == 1:
                times.append(dt)
        # here dt is the time of a single iteration, so no extra num_steps factor
        mfu = model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
        if stage == 1:
            print(f"time per iteration: {dt*1000:.4f}ms, MFU: {mfu*100:.2f}%")
    print(f"training_throughput_{batch_size}", batch_size * block_size * gradient_accumulation_steps / np.mean(times))
    result[f"training_throughput_{batch_size}"] = batch_size * block_size * gradient_accumulation_steps / np.mean(times)
# now batch size should be 12
batch_size = 12
# -----------------------------------------------------------------------------------
with open('results.json', 'w') as f:
    print("save results to results.json")
    print(result)
    json.dump(result, f)