-
Notifications
You must be signed in to change notification settings - Fork 7
/
config_model.py
60 lines (53 loc) · 1.87 KB
/
config_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import texar.tf as tx
beam_width = 5
hidden_dim = 768
bert = {
'pretrained_model_name': 'bert-base-uncased'
}
# See https://texar.readthedocs.io/en/latest/code/modules.html#texar.tf.modules.BERTEncoder.default_hparams
bert_encoder = {}
# From https://github.com/asyml/texar/blob/413e07f859acbbee979f274b52942edd57b335c1/examples/transformer/config_model.py#L27-L45
# with adjustments for BERT
decoder = {
'dim': hidden_dim,
'num_blocks': 6,
'multihead_attention': {
'num_heads': 8,
'output_dim': hidden_dim
},
'initializer': {
'type': 'variance_scaling_initializer',
'kwargs': {
'scale': 1.0,
'mode': 'fan_avg',
'distribution': 'uniform',
},
},
'poswise_feedforward': tx.modules.default_transformer_poswise_net_hparams(output_dim=hidden_dim)
}
loss_label_confidence = 0.9
opt = {
'optimizer': {
'type': 'AdamOptimizer',
'kwargs': {
'beta1': 0.9,
'beta2': 0.997,
'epsilon': 1e-9
}
}
}
lr = {
# The 'learning_rate_schedule' can have the following 3 values:
# - 'static' -> A simple static learning rate, specified by 'static_lr'
# - 'aiayn' -> The learning rate used in the "Attention is all you need" paper.
# - 'constant.linear_warmup.rsqrt_decay.rsqrt_depth' -> The learning rate for Texar's Transformer example
'learning_rate_schedule': 'aiayn',
# The learning rate constant used for the 'constant.linear_warmup.rsqrt_decay.rsqrt_depth' learning rate
'lr_constant': 2 * (hidden_dim ** -0.5),
# The warmup steps for the 'aiayn' and 'constant.linear_warmup.rsqrt_decay.rsqrt_depth' learning rate
'warmup_steps': 4000,
# The static learning rate, when 'static' is used.
'static_lr': 1e-3,
# A multiplier that can be applied to the 'aiayn' learning rate.
'aiayn_multiplier': 0.2
}