From e8114a759c558b584fb3633ece9ba7e7714d7892 Mon Sep 17 00:00:00 2001 From: Hu Date: Sun, 27 Sep 2020 20:05:34 +0800 Subject: [PATCH 1/8] update --- .../machine_translation/wmt2014_ende_base.sh | 76 +++++++++ scripts/machine_translation/README.md | 32 ++-- .../wmt2014_back_translation.sh | 154 +++++++++--------- src/gluonnlp/cli/average_checkpoint.py | 2 +- 4 files changed, 172 insertions(+), 92 deletions(-) create mode 100644 scripts/datasets/machine_translation/wmt2014_ende_base.sh diff --git a/scripts/datasets/machine_translation/wmt2014_ende_base.sh b/scripts/datasets/machine_translation/wmt2014_ende_base.sh new file mode 100644 index 0000000000..13cfc0c7da --- /dev/null +++ b/scripts/datasets/machine_translation/wmt2014_ende_base.sh @@ -0,0 +1,76 @@ +SUBWORD_ALGO=$1 +SRC=en +TGT=de +SAVE_PATH=wmt2014_ende + +# Fetch the raw text +nlp_data prepare_wmt \ + --dataset wmt2014 \ + --lang-pair ${SRC}-${TGT} \ + --save-path ${SAVE_PATH} + +# We use sacrebleu to fetch the dev set (newstest2013) and test set (newstest2014) +sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC} +sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT} +sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC} +sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT} + + +# Clean and tokenize the training + dev corpus +cd ${SAVE_PATH} +nlp_process clean_tok_para_corpus --src-lang ${SRC} \ + --tgt-lang ${TGT} \ + --src-corpus train.raw.${SRC} \ + --tgt-corpus train.raw.${TGT} \ + --min-num-words 1 \ + --max-num-words 100 \ + --src-save-path train.tok.${SRC} \ + --tgt-save-path train.tok.${TGT} + +nlp_process clean_tok_para_corpus --src-lang ${SRC} \ + --tgt-lang ${TGT} \ + --src-corpus dev.raw.${SRC} \ + --tgt-corpus dev.raw.${TGT} \ + --min-num-words 1 \ + --max-num-words 100 \ + --src-save-path dev.tok.${SRC} \ + --tgt-save-path dev.tok.${TGT} + +# For test corpus, we will just tokenize the data +nlp_process clean_tok_para_corpus --src-lang ${SRC} \ + --tgt-lang ${TGT} \ + --src-corpus test.raw.${SRC} \ + --tgt-corpus test.raw.${TGT} \ + --src-save-path test.tok.${SRC} \ + --tgt-save-path test.tok.${TGT} + +# Learn BPE with the training data +nlp_process learn_subword --corpus train.tok.${SRC} train.tok.${TGT} \ + --model ${SUBWORD_ALGO} \ + --save-dir . \ + --vocab-size 32768 + +# Apply the learned codes to the training set +for LANG in ${SRC} ${TGT} +do +nlp_process apply_subword --model ${SUBWORD_ALGO}\ + --output-type subword \ + --model-path ${SUBWORD_ALGO}.model \ + --vocab-path ${SUBWORD_ALGO}.vocab \ + --corpus train.tok.${LANG} \ + --save-path train.tok.${SUBWORD_ALGO}.${LANG} +done + +# Apply the learned codes to the dev/test set +for LANG in ${SRC} ${TGT} +do + for SPLIT in dev test + do + nlp_process apply_subword --model ${SUBWORD_ALGO} \ + --output-type subword \ + --model-path ${SUBWORD_ALGO}.model \ + --vocab-path ${SUBWORD_ALGO}.vocab \ + --corpus ${SPLIT}.tok.${LANG} \ + --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG} + done +done diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 9164e4b0d5..f94487b6c2 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -7,7 +7,8 @@ to generate the dataset. Then, run `train_transformer.py` to train the model. In the following, we give the training script for WMT2014 EN-DE task with yttm tokenizer. 
You may first run the following command in [datasets/machine_translation](../datasets/machine_translation). ```bash -bash ../datasets/machine_translation/wmt2014_ende.sh yttm +bash ../datasets/machine_translation/wmt2014_ende_base.sh yttm (For transformer_base config) +bash ../datasets/machine_translation/wmt2014_ende.sh yttm (For transformer_wmt_en_de_big config) ``` Then, you can run the experiment. @@ -31,9 +32,8 @@ python3 train_transformer.py \ --lr 0.002 \ --sampler BoundedBudgetSampler \ --max_num_tokens 2700 \ - --max_update 15000 \ - --save_interval_update 500 \ - --warmup_steps 6000 \ + --epochs 30 \ + --warmup_steps 4000 \ --warmup_init_lr 0.0 \ --seed 123 \ --gpus 0,1,2,3 @@ -41,6 +41,9 @@ python3 train_transformer.py \ Or training via horovod ``` +SUBWORD_ALGO=subword_nmt +SRC=en +TGT=de horovodrun -np 4 -H localhost:4 python3 train_transformer.py \ --comm_backend horovod \ --train_src_corpus wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \ @@ -56,28 +59,27 @@ horovodrun -np 4 -H localhost:4 python3 train_transformer.py \ --lr 0.002 \ --sampler BoundedBudgetSampler \ --max_num_tokens 2700 \ - --max_update 15000 \ - --save_interval_update 500 \ - --warmup_steps 6000 \ + --epochs 30 \ + --warmup_steps 4000 \ --warmup_init_lr 0.0 \ --seed 123 \ --gpus 0,1,2,3 ``` -Use the average_checkpoint cli to average the last 10 checkpoints +Use the average_checkpoint cli to average the last 5 checkpoints ```bash gluon_average_checkpoint --checkpoints transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch*.params \ - --begin 30 \ - --end 39 \ - --save-path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch_avg_30_39.params + --begin 25 \ + --end 29 \ + --save-path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/avg_25_29.params ``` Use the following command to inference/evaluate the Transformer model: ```bash python3 evaluate_transformer.py \ - --param_path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch_avg_30_39.params \ + --param_path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/avg_25_29.params \ --src_lang en \ --tgt_lang de \ --cfg transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/config.yml \ @@ -134,7 +136,7 @@ Use the following command to inference/evaluate the Transformer model: ```bash python3 evaluate_transformer.py \ - --param_path transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/average_21_30.params \ + --param_path transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/avg_21_30.params \ --src_lang en \ --tgt_lang de \ --cfg transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/config.yml \ @@ -156,7 +158,7 @@ Test BLEU score with 3 seeds (evaluated via sacre BLEU): (test bleu / valid bleu) | Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std | |---------------|------------|-------------|-------------|--------------|-------------| -| yttm | | 26.50/26.29 | - | - | - | +| yttm | | 26.78/25.96 | - | - | - | | hf_bpe | | - | - | - | - | | spm | | - | - | - | - | @@ -165,6 +167,6 @@ Test BLEU score with 3 seeds (evaluated via sacre BLEU): (test bleu / valid bleu) | Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std | |---------------|------------|-------------|-------------|--------------|-------------| -| yttm | | 27.93/26.82 | - | - | - | +| yttm | | 27.99/26.84 | - | - | - | | hf_bpe | | - | - | - | - | | spm | | - | - | - | - | diff --git a/scripts/machine_translation/wmt2014_back_translation.sh b/scripts/machine_translation/wmt2014_back_translation.sh index 6e4c91e6c4..caac6c4321 100644 --- 
a/scripts/machine_translation/wmt2014_back_translation.sh
+++ b/scripts/machine_translation/wmt2014_back_translation.sh
@@ -3,47 +3,55 @@ SRC=en
TGT=de
# prepare en_de data for the reverse model
-cd ../datasets/machine_translation
-bash wmt2014_ende.sh ${SUBWORD_ALGO}
+bash ../datasets/machine_translation/wmt2014_ende.sh ${SUBWORD_ALGO}
# Fetch the raw mono text
nlp_data prepare_wmt \
- --mono \
- --mono_lang ${TGT} \
- --dataset newscrawl \
- --save-path wmt2014_mono
+ --mono \
+ --mono_lang ${TGT} \
+ --dataset newscrawl \
+ --save-path wmt2014_mono
# Clean and tokenize the monolingual corpus
cd wmt2014_mono
nlp_process clean_tok_mono_corpus \
- --lang ${TGT} \
- --corpus train.raw.${TGT} \
- --min-num-words 1 \
- --max-num-words 100 \
- --save-path train.tok.${TGT}
+ --lang ${TGT} \
+ --corpus train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --save-path train.tok.${TGT}
-cd ../../../machine_translation
-datapath=../datasets/machine_translation
# train the reverse model to translate German to English
python3 train_transformer.py \
- --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
- --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
- --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
- --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
- --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
- --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
- --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
- --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
- --save_dir transformer_wmt2014_de_en_${SUBWORD_ALGO} \
- --cfg transformer_base \
- --lr 0.002 \
+ --train_src_corpus wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --train_tgt_corpus wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_src_corpus wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_tgt_corpus wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_big_wmt2014_de_en_${SUBWORD_ALGO} \
+ --cfg transformer_wmt_en_de_big \
+ --lr 0.001 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 3584 \
+ --max_update 15000 \
--warmup_steps 4000 \
--warmup_init_lr 0.0 \
- --seed 100 \
+ --seed 123 \
--gpus 0,1,2,3
+# Average the last 10 checkpoints
+
+gluon_average_checkpoint --checkpoints transformer_big_wmt2014_de_en_${SUBWORD_ALGO}/update*.params \
+ --begin 21 \
+ --end 30 \
+ --save-path transformer_big_wmt2014_de_en_${SUBWORD_ALGO}/avg.params
+
+
# Due to the limited memory, we need to split the data and process the data divided respectively
split -l 400000 ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/train.tok.${TGT}.split -d -a 3
@@ -52,7 +60,7 @@ split -l 400000 ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mo
GPUS=(0 1 2 3)
IDX=0
for NUM in ` seq -f %03g 0 193 `; do
- split_corpus=${datapath}/wmt2014_mono/train.tok.${TGT}.split${NUM}
+ split_corpus=wmt2014_mono/train.tok.${TGT}.split${NUM}
if [ ${IDX} -eq ${#GPUS[@]} ]; then
let "IDX=0"
wait
@@ -60,16 +68,16 @@ for NUM in ` seq -f %03g 0 193 `; do
{
echo processing ${split_corpus}
python3 evaluate_transformer.py \
--param_path transformer_big_wmt2014_de_en_${SUBWORD_ALGO}/avg.params \ --src_lang ${TGT} \ --tgt_lang ${SRC} \ --cfg transformer_base \ --src_tokenizer ${SUBWORD_ALGO} \ --tgt_tokenizer ${SUBWORD_ALGO} \ - --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ --src_corpus ${split_corpus} \ --save_dir ${split_corpus/.${TGT}./.${SRC}.} \ --beam-size 1 \ @@ -80,81 +88,75 @@ for NUM in ` seq -f %03g 0 193 `; do done wait -cat ` seq -f "${datapath}/wmt2014_mono/train.tok.${SRC}.split%03g/pred_sentences.txt" 0 193 ` \ - > ${datapath}/wmt2014_mono/syn.train.raw.${SRC} -cp ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/syn.train.raw.${TGT} +cat ` seq -f "wmt2014_mono/train.tok.${SRC}.split%03g/pred_sentences.txt" 0 193 ` \ + > wmt2014_mono/syn.train.raw.${SRC} +cp wmt2014_mono/train.tok.${TGT} wmt2014_mono/syn.train.raw.${TGT} # Clean the synthetic data nlp_process clean_tok_para_corpus --src-lang ${SRC} \ --tgt-lang ${TGT} \ - --src-corpus ${datapath}/wmt2014_mono/syn.train.raw.${SRC} \ - --tgt-corpus ${datapath}/wmt2014_mono/syn.train.raw.${TGT} \ + --src-corpus wmt2014_mono/syn.train.raw.${SRC} \ + --tgt-corpus wmt2014_mono/syn.train.raw.${TGT} \ --min-num-words 1 \ - --max-num-words 250 \ + --max-num-words 100 \ --max-ratio 1.5 \ - --src-save-path ${datapath}/wmt2014_mono/syn.train.tok.${SRC} \ - --tgt-save-path ${datapath}/wmt2014_mono/syn.train.tok.${TGT} + --src-save-path wmt2014_mono/syn.train.tok.${SRC} \ + --tgt-save-path wmt2014_mono/syn.train.tok.${TGT} # Combine the synthetic data with upsampled original data # TODO upsample -rm -rf ${datapath}/wmt2014_backtranslation -mkdir ${datapath}/wmt2014_backtranslation +rm -rf wmt2014_backtranslation +mkdir wmt2014_backtranslation for LANG in ${SRC} ${TGT} ; do - cat ${datapath}/wmt2014_ende/train.tok.${LANG} ${datapath}/wmt2014_mono/syn.train.tok.${LANG} \ - > ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG} + cat wmt2014_ende/train.tok.${LANG} wmt2014_mono/syn.train.tok.${LANG} \ + > wmt2014_backtranslation/bt.train.tok.${LANG} done # Tokenize for LANG in ${SRC} ${TGT} ; do nlp_process apply_subword --model ${SUBWORD_ALGO} \ --output-type subword \ - --model-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --vocab-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG} \ - --save-path ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${LANG} + --model-path wmt2014_ende/${SUBWORD_ALGO}.model \ + --vocab-path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --corpus wmt2014_backtranslation/bt.train.tok.${LANG} \ + --save-path wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${LANG} done # Use the combine data to train the new model python3 train_transformer.py \ - --train_src_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${SRC} \ - --train_tgt_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${TGT} \ - --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \ - --dev_tgt_corpus 
${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \ - --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --save_dir backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO} \ - --cfg transformer_base \ - --lr 0.003 \ - --max_num_tokens 4096 \ + --train_src_corpus wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${SRC} \ + --train_tgt_corpus wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${TGT} \ + --dev_src_corpus wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \ + --dev_tgt_corpus wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \ + --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --save_dir backtranslation_transformer_big_wmt2014_ende_${SUBWORD_ALGO} \ + --cfg transformer_wmt_en_de_big \ + --lr 0.0007 \ --sampler BoundedBudgetSampler \ - --comm_backend horovod \ - --max_update 30000 \ + --max_num_tokens 3584 \ + --warmup_steps 4000 \ + --max_update 100000 \ --save_interval_update 1000 \ - --warmup_steps 6000 \ --warmup_init_lr 0.0 \ - --num_averages -1 \ --seed 123 \ --gpus 0,1,2,3 -# TODO nlp_average_checkpoint -nlp_nmt average_checkpoint --prefix range() \ - --suffix \ - --save-path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/average.params +# avg the checkpoints # Finally, we can evaluate the model python3 evaluate_transformer.py \ - --param_path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/avg_20_29.params \ + --param_path backtranslation_transformer_big_wmt2014_ende_${SUBWORD_ALGO}/avg.params \ --src_lang ${SRC} \ --tgt_lang ${TGT} \ - --cfg transformer_base \ + --cfg backtranslation_transformer_big_wmt2014_ende_${SUBWORD_ALGO}/config.yml \ --src_tokenizer ${SUBWORD_ALGO} \ --tgt_tokenizer ${SUBWORD_ALGO} \ - --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \ - --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \ - --src_corpus ${datapath}/wmt2014_ende/test.raw.${SRC} \ - --tgt_corpus ${datapath}/wmt2014_ende/test.raw.${TGT} \ - --gpus 0 + --src_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --tgt_subword_model_path wmt2014_ende/${SUBWORD_ALGO}.model \ + --src_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --tgt_vocab_path wmt2014_ende/${SUBWORD_ALGO}.vocab \ + --src_corpus wmt2014_ende/test.raw.${SRC} \ + --tgt_corpus wmt2014_ende/test.raw.${TGT} diff --git a/src/gluonnlp/cli/average_checkpoint.py b/src/gluonnlp/cli/average_checkpoint.py index 5a5fff74bd..5c0345694c 100644 --- a/src/gluonnlp/cli/average_checkpoint.py +++ b/src/gluonnlp/cli/average_checkpoint.py @@ -49,7 +49,7 @@ def main(args): for key in keys: res[key] += ckpt[key] for key in keys: - res[key] /= len(args.range) + res[key] /= len(ckpt_paths) mx.npx.save(args.save_path, res) From 179aaf3b79081095d77123cdf20593e358c26414 Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 28 Sep 2020 12:51:21 +0800 Subject: [PATCH 2/8] update --- scripts/machine_translation/README.md | 2 +- .../wmt2014_back_translation.sh | 3 +- src/gluonnlp/cli/average_checkpoint.py | 2 +- 
tests/process_cli/test_average_checkpoint.py | 40 +++++++++++++++++++ 4 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 tests/process_cli/test_average_checkpoint.py diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index f94487b6c2..5b487151ea 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -41,7 +41,7 @@ python3 train_transformer.py \ Or training via horovod ``` -SUBWORD_ALGO=subword_nmt +SUBWORD_ALGO=yttm SRC=en TGT=de horovodrun -np 4 -H localhost:4 python3 train_transformer.py \ diff --git a/scripts/machine_translation/wmt2014_back_translation.sh b/scripts/machine_translation/wmt2014_back_translation.sh index caac6c4321..f459425a76 100644 --- a/scripts/machine_translation/wmt2014_back_translation.sh +++ b/scripts/machine_translation/wmt2014_back_translation.sh @@ -105,8 +105,7 @@ nlp_process clean_tok_para_corpus --src-lang ${SRC} \ # Combine the synthetic data with upsampled original data # TODO upsample -rm -rf wmt2014_backtranslation -mkdir wmt2014_backtranslation +mkdir -p wmt2014_backtranslation for LANG in ${SRC} ${TGT} ; do cat wmt2014_ende/train.tok.${LANG} wmt2014_mono/syn.train.tok.${LANG} \ > wmt2014_backtranslation/bt.train.tok.${LANG} diff --git a/src/gluonnlp/cli/average_checkpoint.py b/src/gluonnlp/cli/average_checkpoint.py index 5c0345694c..a1633ffe37 100644 --- a/src/gluonnlp/cli/average_checkpoint.py +++ b/src/gluonnlp/cli/average_checkpoint.py @@ -30,7 +30,7 @@ def main(args): elif ckpt_updates_regexp.fullmatch(ckpt_path) is not None: ckpt_regexp = ckpt_updates_regexp else: - raise Exception('Wrong checkpoints path format') + raise Exception('Wrong checkpoints path format: {}'.format(ckpt_path)) ckpt_paths = [] for path in args.checkpoints: diff --git a/tests/process_cli/test_average_checkpoint.py b/tests/process_cli/test_average_checkpoint.py new file mode 100644 index 0000000000..092cc4dba0 --- /dev/null +++ b/tests/process_cli/test_average_checkpoint.py @@ -0,0 +1,40 @@ +import os +from gluonnlp.cli import average_checkpoint +from mxnet.gluon import nn +from numpy.testing import assert_allclose + +_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__))) + +def test_avg_ckpt(): + num_ckpts = 5 + model = nn.Dense(units=10, in_units=10) + model.initialize() + params = model.collect_params() + gd_avg = {} + for key in params.keys(): + gd_avg[key] = params[key].data().asnumpy() + model.save_parameters(os.path.join(_CURR_DIR, 'update0.params')) + + for i in range(1, num_ckpts): + model.initialize(force_reinit=True) + params = model.collect_params() + for key in gd_avg.keys(): + gd_avg[key] += params[key].data().asnumpy() + model.save_parameters(os.path.join(_CURR_DIR, 'update{}.params'.format(i))) + + for key in gd_avg.keys(): + gd_avg[key] /= num_ckpts + + parser = average_checkpoint.get_parser() + args = parser.parse_args(['--checkpoints', None, + '--begin', '0', + '--end', str(num_ckpts-1), + '--save-path', 'avg.params']) + args.checkpoints = ['update{}.params'.format(i) for i in range(0, num_ckpts)] + average_checkpoint.main(args) + + model.load_parameters('avg.params') + params = model.collect_params() + + for key in gd_avg.keys(): + assert_allclose(gd_avg[key], params[key].data().asnumpy(), 1E-7, 1E-7) From 13a4652ad1ba5fbc357dca50acda9c40f3e3a1b4 Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 28 Sep 2020 13:51:15 +0800 Subject: [PATCH 3/8] update --- tests/process_cli/test_average_checkpoint.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/tests/process_cli/test_average_checkpoint.py b/tests/process_cli/test_average_checkpoint.py index 092cc4dba0..d81473fcb9 100644 --- a/tests/process_cli/test_average_checkpoint.py +++ b/tests/process_cli/test_average_checkpoint.py @@ -29,11 +29,12 @@ def test_avg_ckpt(): args = parser.parse_args(['--checkpoints', None, '--begin', '0', '--end', str(num_ckpts-1), - '--save-path', 'avg.params']) - args.checkpoints = ['update{}.params'.format(i) for i in range(0, num_ckpts)] + '--save-path', os.path.join(_CURR_DIR, 'avg.params')]) + args.checkpoints = [os.path.join(_CURR_DIR, 'update{}.params'.format(i)) \ + for i in range(0, num_ckpts)] average_checkpoint.main(args) - model.load_parameters('avg.params') + model.load_parameters(os.path.join(_CURR_DIR, 'avg.params')) params = model.collect_params() for key in gd_avg.keys(): From a1e21176e83ab2601eabb38540810fd431c38feb Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 28 Sep 2020 14:47:38 +0800 Subject: [PATCH 4/8] update --- tests/process_cli/test_average_checkpoint.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/process_cli/test_average_checkpoint.py b/tests/process_cli/test_average_checkpoint.py index d81473fcb9..312bc935f5 100644 --- a/tests/process_cli/test_average_checkpoint.py +++ b/tests/process_cli/test_average_checkpoint.py @@ -30,6 +30,11 @@ def test_avg_ckpt(): '--begin', '0', '--end', str(num_ckpts-1), '--save-path', os.path.join(_CURR_DIR, 'avg.params')]) + args.checkpoints = ['fake', 'ckpt'] + try: + average_checkpoint.main(args) + except: + pass args.checkpoints = [os.path.join(_CURR_DIR, 'update{}.params'.format(i)) \ for i in range(0, num_ckpts)] average_checkpoint.main(args) From dd7896d986b2b2db69cb5b7cf6d840685f9ad14f Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 28 Sep 2020 14:49:49 +0800 Subject: [PATCH 5/8] update --- scripts/machine_translation/README.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 5b487151ea..3e2b853de3 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -156,17 +156,13 @@ Test BLEU score with 3 seeds (evaluated via sacre BLEU): - transformer_base (test bleu / valid bleu) -| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std | -|---------------|------------|-------------|-------------|--------------|-------------| -| yttm | | 26.78/25.96 | - | - | - | -| hf_bpe | | - | - | - | - | -| spm | | - | - | - | - | +| Subword Model | Seed = 123 | +|---------------|-------------| +| yttm | 26.78/25.96 | - transformer_wmt_en_de_big (test bleu / valid bleu) -| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std | -|---------------|------------|-------------|-------------|--------------|-------------| -| yttm | | 27.99/26.84 | - | - | - | -| hf_bpe | | - | - | - | - | -| spm | | - | - | - | - | +| Subword Model | Seed = 123 | +|---------------|-------------| +| yttm | 27.99/26.84 | From d6f7b92b401601e004f36fb8b3fe0a5e9a2f9b8c Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 28 Sep 2020 16:21:54 +0800 Subject: [PATCH 6/8] update --- tests/process_cli/test_average_checkpoint.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/process_cli/test_average_checkpoint.py b/tests/process_cli/test_average_checkpoint.py index 312bc935f5..7bb288d0c4 100644 --- a/tests/process_cli/test_average_checkpoint.py +++ b/tests/process_cli/test_average_checkpoint.py @@ -6,6 
+6,10 @@ _CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
def test_avg_ckpt():
+ try:
+ average_checkpoint.cli_main()
+ except:
+ pass
num_ckpts = 5
model = nn.Dense(units=10, in_units=10)
model.initialize()
params = model.collect_params()

From 5ba73e916f78633c1fe55c4bae7b3a95693e4911 Mon Sep 17 00:00:00 2001
From: Hu
Date: Mon, 28 Sep 2020 20:56:18 +0800
Subject: [PATCH 7/8] update

---
scripts/machine_translation/README.md | 1 +
1 file changed, 1 insertion(+)

diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md
index 3e2b853de3..4bafcb920c 100644
--- a/scripts/machine_translation/README.md
+++ b/scripts/machine_translation/README.md
@@ -40,6 +40,7 @@ python3 train_transformer.py \
```
Or training via horovod
+
```
SUBWORD_ALGO=yttm
SRC=en
TGT=de

From 7de65b5629af0497c7d48e387182a0899d5f5784 Mon Sep 17 00:00:00 2001
From: Hu
Date: Mon, 28 Sep 2020 23:26:15 +0800
Subject: [PATCH 8/8] update

---
tests/test_models.py | 3 +++
1 file changed, 3 insertions(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index 5df3701a5e..3a41dcf656 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -16,6 +16,9 @@ def test_list_backbone_names():
def test_get_backbone(name, ctx):
with tempfile.TemporaryDirectory() as root, ctx:
model_cls, cfg, tokenizer, local_params_path, _ = get_backbone(name, root=root)
+ if name == 'gpt2_1558M':
+ # skip gpt2_1558M due to limited disk space
+ return
net = model_cls.from_cfg(cfg)
net.load_parameters(local_params_path)
net.hybridize()
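
Note on the `average_checkpoint` fix and the new `tests/process_cli/test_average_checkpoint.py`: the behavior they pin down is plain element-wise parameter averaging, where the accumulated sum is divided by the number of checkpoint files actually loaded (`len(ckpt_paths)`) rather than by the size of the `--begin`/`--end` range. The sketch below only illustrates that contract with plain NumPy dictionaries; the helper name `average_param_dicts` and the use of NumPy instead of `mx.npx.save`/`mx.npx.load` are assumptions made for the example, not code from the CLI.

```python
# Minimal sketch (not the actual gluonnlp CLI) of checkpoint averaging:
# sum the parameter arrays of every checkpoint, then divide by the number
# of checkpoints that were loaded.
import numpy as np

def average_param_dicts(param_dicts):
    """Element-wise average of a list of {name: ndarray} checkpoint dicts."""
    assert len(param_dicts) > 0
    keys = param_dicts[0].keys()
    avg = {key: np.zeros_like(param_dicts[0][key]) for key in keys}
    for params in param_dicts:
        for key in keys:
            avg[key] += params[key]
    for key in keys:
        avg[key] /= len(param_dicts)  # divide by the checkpoint count, not a range length
    return avg

# Toy usage: three fake "checkpoints" holding a single weight matrix.
ckpts = [{'weight': np.full((2, 2), float(i))} for i in range(3)]
print(average_param_dicts(ckpts)['weight'])  # every entry is (0 + 1 + 2) / 3 = 1.0
```

This is also why the test can precompute `gd_avg` by hand and compare it against the parameters written by `gluon_average_checkpoint`.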