# config.yaml
---
# Input-data settings. `seed` here is scoped to `data` and is separate from
# the top-level `seed` key.
data:
  datasets_used_per_step: 2
  interleaved_datasets: 2
  parallel_workers: 2
  # gs:// glob; the trailing `*` is not at the start of the scalar, so it is
  # not read as a YAML alias.
  path: gs://homebrewnlp-eu/the-char-pile/*
  prefetch_buffer: 2
  seed: 0
  shuffle_buffer: 0
  # NOTE(review): same value as `vocab` under `dims` (256) — presumably the
  # two must stay in sync; confirm against the consumer.
  vocab_size: 256
# Named dimension sizes, grouped under `dims`.
# NOTE(review): presumably tensor-axis sizes used to build the model — confirm
# against the code that reads this mapping.
dims:
  batch: 16
  depth: 64
  features: 256
  heads: 8
  inner_bottleneck_features: 128
  inner_bottleneck_kernel: 49
  intermediate: 512
  moe_intermediate: 4096
  one: 1
  outer_bottleneck_kernel: 25
  pointwise_features: 512
  pointwise_kernel: 5
  sequence: 4096
  # NOTE(review): same value as `vocab_size` listed with the data settings
  # (256) — presumably must stay in sync; confirm.
  vocab: 256
# Top-level string setting; empty here.
# NOTE(review): presumably a prefix prepended to generated names — confirm
# against the consumer.
global_prefix: ''
# Model hyperparameters, grouped under `model`.
model:
  activation_std: 0.5893595616022745
  # Two dtypes are configured: one for computation, one for storage.
  # NOTE(review): presumably compute runs in bfloat16 while parameters are
  # kept in float32 — confirm.
  computation_dtype: bfloat16
  conv_scale: 4.0
  conv_shift: 8.0
  leaky_relu_slope: 0.02
  norm_eps: 1.0e-16
  qrnn_frequency: 8
  rezero_lr_scale: 0.01
  storage_dtype: float32
# Optimizer hyperparameters, grouped under `optimizer`.
optimizer:
  # NOTE(review): these betas are far below the conventional 0.9 / 0.999 —
  # presumably stored as (1 - beta); confirm against the optimizer code
  # before changing them.
  adam_beta1: 0.03
  adam_beta2: 0.003
  block_size: 512
  bottleneck_scale: 1
  epsilon: 1.0e-16
  exponential_decay: 3.0e-06
  gradient_clip: 0.001
  input_scale: 1
  learning_rate: 0.01
  moe_scale: 1
  momentum_beta: 0.1
  norm_scale: 1
  output_scale: 1
  pointwise_scale: 1
  # NOTE(review): the *preconditioning* / statistics keys (with `block_size`)
  # look like Shampoo-style second-order optimizer settings — confirm.
  preconditioning_compute_steps: 128
  qrnn_scale: 1
  skip_preconditioning_dim_size_gt: 1024
  start_preconditioning_step: 16
  statistics_compute_steps: 4
  warmup_end: 1024
  weight_decay: 0.001
# Top-level seed.
# NOTE(review): a second `seed` entry is listed with the data settings;
# presumably this one covers everything outside the input pipeline — confirm.
seed: 0
# Training-loop settings, grouped under `training`.
# NOTE(review): the nesting of `early_stopping`, `expected_loss` and `trace`
# is reconstructed from key order (keys are alphabetical within each mapping)
# — confirm against the config schema.
training:
  checkpoint_interval: 2048
  # Empty string. NOTE(review): presumably means "do not load a checkpoint".
  checkpoint_load_path: ''
  checkpoint_path: gs://homebrewnlp-eu/homebrewnlp-checkpoint-deep
  device_steps: 1
  device_unroll: 1
  do_checkpoint: true
  early_stopping:
    # NOTE(review): exponent / offset / scale look like the parameters of a
    # fitted loss curve — confirm the exact formula in the consumer.
    expected_loss:
      exponent: -0.3642513
      offset: 6.165868
      scale: 39.08037
    loss_patience: 0.875
    maximum_spike_duration: 24
    maximum_spike_size: 3
    minimum_relative_loss_change: 0.003
  print_interval: 1
  steps: 65536
  # Tracing is switched off here (`do_trace: false`); the remaining keys
  # configure it.
  trace:
    do_trace: false
    output_path: trace
    start_step: 16
    stop_step: 80
  z_loss: 0.01
# Weights & Biases logging settings, grouped under `wandb`; logging is
# enabled here (`use_wandb: true`).
wandb:
  entity: homebrewnlp
  log_frequency: 1
  # NOTE(review): presumably window sizes for running-median metrics, with
  # `percentile` applied alongside — confirm against the logging code.
  median_sizes:
    - 64
    - 256
    - 1024
  percentile: 25
  project: gpt
  use_wandb: true