forked from kevinduh/sockeye-recipes
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrs1-rmsprop2.hpm
112 lines (95 loc) · 4.46 KB
/
rs1-rmsprop2.hpm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#####################################################################
# Sockeye-recipes Hyperparameter configuration file #
# #
# Overview: #
# - "workdir" corresponds to a group of preprocessed bitext and models #
# for a given dataset. Each "workdir" can contain multiple #
# "datadir" and "modeldir" if desired #
# - "datadir" stores the BPE-preprocessed training and validation #
# bitext files #
# - "modeldir" is generated by Sockeye and stores all training info #
# - "rootdir" is the path to your installation of sockeye-recipes, #
# e.g. ~/src/sockeye-recipes #
# #
# preprocess-bpe.sh: #
# - input: Tokenized bitext for training ("train_tok") and #
# validation ("valid_tok") #
# - output: BPE-preprocessed bitext ("train_bpe", "valid_bpe") #
# and vocabulary ("bpe_vocab_src", "bpe_vocab_trg") #
# - main hyperparameters: number of BPE symbols for source & target #
# #
# train.sh: #
# - input: BPE-preprocessed bitext ("train_bpe", "valid_bpe") #
# - output: "modeldir", which contains all training info and can #
# be used to translate #
# - main hyperparameters: many! see below #
# #
# translate.sh: #
# - input: this hyperparam file, which specifies modeldir #
# - output: resulting target translation of source file #
#####################################################################
#####################################################################
# (0) General settings (to be modified for each project) #
#####################################################################
### User-specified directories ###
# workdir:  base directory holding this dataset's preprocessed data and models
# rootdir:  path to the sockeye-recipes installation
# modeldir: model output directory, placed under workdir
# DESCRIPTION: rs1: RNN-based seq2seq model, Small
workdir="./"
rootdir="../../"
modeldir="${workdir}/rs1-rmsprop2"

### Language pair (source and target) ###
# Note: all bitext files are assumed to carry these as filename suffixes,
# e.g. $train_tok.$src and $train_tok.$trg refer to the source and target
# sides of the tokenized training bitext.
trg="en"
src="de"
#####################################################################
# (1) preprocess-bpe.sh settings (modify if needed) #
#####################################################################
### Number of symbols to use for BPE ###
# Note: BPE is performed separately for source and target, so these values
# correspond to the initial source (src) and target (trg) vocabulary sizes.
bpe_symbols_trg=30000
bpe_symbols_src=30000

### Filenames for the BPE-processed bitext ###
# Note: the default names below should be fine for most use cases.
# Each path is built from datadir, the BPE symbol count of that side, and the
# language suffix of that side.
datadir="${workdir}/data-bpe/"
train_bpe_src="${datadir}/train.bpe-${bpe_symbols_src}.${src}"
train_bpe_trg="${datadir}/train.bpe-${bpe_symbols_trg}.${trg}"
valid_bpe_src="${datadir}/valid.bpe-${bpe_symbols_src}.${src}"
valid_bpe_trg="${datadir}/valid.bpe-${bpe_symbols_trg}.${trg}"

### Filenames for the BPE vocabulary ###
# Note: the default names below should be fine for most use cases.
# Note: bpe_vocab_src is also needed by translate.sh to apply BPE to test input.
bpe_vocab_src="${train_bpe_src}.bpe_vocab"
bpe_vocab_trg="${train_bpe_trg}.bpe_vocab"
#####################################################################
# (2) train.sh settings (modify if needed) #
#####################################################################
# Model architecture
# Colon-separated values appear to be source:target pairs -- TODO confirm
# against train.sh.
rnn_cell_type='lstm'
rnn_attention_type='dot'
rnn_num_hidden=512
num_embed='512:512'
num_layers=1

# Regularization
# All dropout settings are zero in this configuration; only label smoothing
# is non-zero.
rnn_dropout_states='.0:.0'
rnn_dropout_inputs='.0:.0'
embed_dropout='.0:.0'
label_smoothing=0.1

# Vocabulary
# num_words reuses the BPE symbol counts set in the preprocess-bpe.sh section.
num_words="${bpe_symbols_src}:${bpe_symbols_trg}"
max_seq_len='100:100'
word_min_count='1:1'

# Training configuration
# This recipe uses the rmsprop optimizer, matching the model directory name.
optimizer='rmsprop'
loss='cross-entropy'
batch_size=4096
initial_learning_rate=0.001
learning_rate_reduce_factor=0.5
seed=13

# Logging and stopping condition
checkpoint_frequency=750
max_updates=500000
max_num_epochs=100
min_num_epochs=0
decode_and_evaluate=0
keep_last_params=1