This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[FEATURE] Update readme of nmt #1373

Merged: 8 commits, Sep 29, 2020
76 changes: 76 additions & 0 deletions scripts/datasets/machine_translation/wmt2014_ende_base.sh
@@ -0,0 +1,76 @@
SUBWORD_ALGO=$1
SRC=en
TGT=de
SAVE_PATH=wmt2014_ende

# Fetch the raw text
nlp_data prepare_wmt \
    --dataset wmt2014 \
    --lang-pair ${SRC}-${TGT} \
    --save-path ${SAVE_PATH}

# We use sacrebleu to fetch the dev set (newstest2013) and test set (newstest2014)
sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
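Not part of the original script, but a cheap guard worth running after the downloads: the source and reference sides of each split must stay line-aligned, or every later pipeline stage silently mispairs sentences. A hedged sketch (the `check_parallel` helper is hypothetical):

```shell
# Hypothetical helper: verify that the two sides of a parallel file pair
# have the same number of lines.
check_parallel() {
    n=$(wc -l < "$1")
    m=$(wc -l < "$2")
    if [ "$((n))" -eq "$((m))" ]; then
        echo "OK: $((n)) parallel lines"
    else
        echo "MISMATCH: $1 has $((n)) lines, $2 has $((m))" >&2
        return 1
    fi
}
# e.g.: check_parallel ${SAVE_PATH}/dev.raw.${SRC} ${SAVE_PATH}/dev.raw.${TGT}
```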


# Clean and tokenize the training + dev corpus
cd ${SAVE_PATH}
nlp_process clean_tok_para_corpus --src-lang ${SRC} \
    --tgt-lang ${TGT} \
    --src-corpus train.raw.${SRC} \
    --tgt-corpus train.raw.${TGT} \
    --min-num-words 1 \
    --max-num-words 100 \
    --src-save-path train.tok.${SRC} \
    --tgt-save-path train.tok.${TGT}

nlp_process clean_tok_para_corpus --src-lang ${SRC} \
    --tgt-lang ${TGT} \
    --src-corpus dev.raw.${SRC} \
    --tgt-corpus dev.raw.${TGT} \
    --min-num-words 1 \
    --max-num-words 100 \
    --src-save-path dev.tok.${SRC} \
    --tgt-save-path dev.tok.${TGT}

# For the test corpus, we just tokenize the data without length filtering
nlp_process clean_tok_para_corpus --src-lang ${SRC} \
    --tgt-lang ${TGT} \
    --src-corpus test.raw.${SRC} \
    --tgt-corpus test.raw.${TGT} \
    --src-save-path test.tok.${SRC} \
    --tgt-save-path test.tok.${TGT}
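The `--min-num-words`/`--max-num-words` bounds above drop sentence pairs whose length falls outside the range. A toy re-implementation of that filter in awk (an illustration only, not the actual `nlp_process` logic):

```shell
# Toy length filter (illustration, not the nlp_process implementation):
# keep a sentence pair only when BOTH sides contain between min and max
# whitespace-separated tokens.
length_filter() {  # usage: length_filter <src_file> <tgt_file> <min> <max>
    paste "$1" "$2" | awk -F'\t' -v min="$3" -v max="$4" '
        {
            ns = split($1, s, " ")   # token count, source side
            nt = split($2, t, " ")   # token count, target side
            if (ns >= min && ns <= max && nt >= min && nt <= max) print
        }'
}
```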

# Learn BPE with the training data
nlp_process learn_subword --corpus train.tok.${SRC} train.tok.${TGT} \
    --model ${SUBWORD_ALGO} \
    --save-dir . \
    --vocab-size 32768

# Apply the learned codes to the training set
for LANG in ${SRC} ${TGT}
do
    nlp_process apply_subword --model ${SUBWORD_ALGO} \
        --output-type subword \
        --model-path ${SUBWORD_ALGO}.model \
        --vocab-path ${SUBWORD_ALGO}.vocab \
        --corpus train.tok.${LANG} \
        --save-path train.tok.${SUBWORD_ALGO}.${LANG}
done

# Apply the learned codes to the dev/test set
for LANG in ${SRC} ${TGT}
do
    for SPLIT in dev test
    do
        nlp_process apply_subword --model ${SUBWORD_ALGO} \
            --output-type subword \
            --model-path ${SUBWORD_ALGO}.model \
            --vocab-path ${SUBWORD_ALGO}.vocab \
            --corpus ${SPLIT}.tok.${LANG} \
            --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
    done
done
32 changes: 17 additions & 15 deletions scripts/machine_translation/README.md
@@ -7,7 +7,8 @@ to generate the dataset. Then, run `train_transformer.py` to train the model.
In the following, we give the training script for WMT2014 EN-DE task with yttm tokenizer.
You may first run the following command in [datasets/machine_translation](../datasets/machine_translation).
```bash
bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm  # for the transformer_base config
bash ../datasets/machine_translation/wmt2014_ende.sh yttm       # for the transformer_wmt_en_de_big config
```

Then, you can run the experiment.
@@ -31,16 +32,18 @@ python3 train_transformer.py \
    --lr 0.002 \
    --sampler BoundedBudgetSampler \
    --max_num_tokens 2700 \
    --epochs 30 \
    --warmup_steps 4000 \
    --warmup_init_lr 0.0 \
    --seed 123 \
    --gpus 0,1,2,3
```
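The `--lr`, `--warmup_steps`, and `--warmup_init_lr` flags suggest the usual inverse-square-root Transformer schedule (an assumption; check the trainer's source for the exact rule): the learning rate rises linearly from 0 to the peak over the warmup steps, then decays proportionally to 1/sqrt(step). A sketch of that assumed schedule:

```shell
# Assumed schedule: lr(step) = peak * min(step/warmup, sqrt(warmup/step))
lr_at() {  # usage: lr_at <step>  (peak lr 0.002, warmup 4000, as in the command above)
    awk -v lr=0.002 -v warmup=4000 -v step="$1" 'BEGIN {
        if (step <= warmup) print lr * step / warmup
        else                print lr * sqrt(warmup / step)
    }'
}
lr_at 2000    # halfway through warmup: 0.001
lr_at 4000    # peak: 0.002
lr_at 16000   # decayed back to half the peak: 0.001
```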

Or train via Horovod:
```bash
SUBWORD_ALGO=subword_nmt
SRC=en
TGT=de
horovodrun -np 4 -H localhost:4 python3 train_transformer.py \
    --comm_backend horovod \
    --train_src_corpus wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
@@ -56,28 +59,27 @@ horovodrun -np 4 -H localhost:4 python3 train_transformer.py \
    --lr 0.002 \
    --sampler BoundedBudgetSampler \
    --max_num_tokens 2700 \
    --epochs 30 \
    --warmup_steps 4000 \
    --warmup_init_lr 0.0 \
    --seed 123 \
    --gpus 0,1,2,3
```

Use the `average_checkpoint` cli to average the last 5 checkpoints.

```bash
gluon_average_checkpoint --checkpoints transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch*.params \
    --begin 25 \
    --end 29 \
    --save-path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/avg_25_29.params
```
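Checkpoint averaging produces a single model whose every parameter is the element-wise mean of that parameter across the selected checkpoints. A toy illustration on single-column text files (the real `gluon_average_checkpoint` operates on `.params` files, not text):

```shell
# Element-wise mean of two "checkpoints" stored as one value per line
# (toy stand-in for averaging model parameters).
avg_params() {  # usage: avg_params <ckpt1> <ckpt2>
    paste "$1" "$2" | awk '{ printf "%.1f\n", ($1 + $2) / 2 }'
}
```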

Use the following command to run inference and evaluate the Transformer model:

```bash
python3 evaluate_transformer.py \
    --param_path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/avg_25_29.params \
    --src_lang en \
    --tgt_lang de \
    --cfg transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/config.yml \
@@ -134,7 +136,7 @@ Use the following command to inference/evaluate the Transformer model:

```bash
python3 evaluate_transformer.py \
    --param_path transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/avg_21_30.params \
    --src_lang en \
    --tgt_lang de \
    --cfg transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/config.yml \
@@ -156,7 +158,7 @@ Test BLEU score with 3 seeds (evaluated via sacreBLEU):
(test bleu / valid bleu)
| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
|---------------|------------|-------------|-------------|--------------|-------------|
| yttm | | 26.78/25.96 | - | - | - |
| hf_bpe | | - | - | - | - |
| spm | | - | - | - | - |
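Once all three seeds have been run, the Mean±std column can be computed as the mean and (population) standard deviation of the three test-BLEU scores. A small helper for that (hypothetical; the scores in the example call are placeholders, not real results):

```shell
# Mean and population standard deviation of whitespace-separated scores.
mean_std() {
    awk '{ for (i = 1; i <= NF; i++) { s += $i; q += $i * $i; n++ } }
         END { m = s / n; printf "%.2f±%.2f\n", m, sqrt(q / n - m * m) }'
}
echo "26.78 26.50 26.60" | mean_std   # placeholder scores, not measured results
```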

@@ -165,6 +167,6 @@ Test BLEU score with 3 seeds (evaluated via sacreBLEU):
(test bleu / valid bleu)
| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
|---------------|------------|-------------|-------------|--------------|-------------|
| yttm | | 27.99/26.84 | - | - | - |
| hf_bpe | | - | - | - | - |
| spm | | - | - | - | - |