From 7ee40129a20b5681dde3654367827a59a7a5cdab Mon Sep 17 00:00:00 2001 From: Joost Bastings Date: Sat, 24 Jun 2017 14:11:49 +0200 Subject: [PATCH 1/2] Only clean training data, not dev/test The data preparation script also cleans the dev/test data, which should not be done. --- bin/data/wmt16_en_de.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/data/wmt16_en_de.sh b/bin/data/wmt16_en_de.sh index 0107ce63..87c31e4f 100755 --- a/bin/data/wmt16_en_de.sh +++ b/bin/data/wmt16_en_de.sh @@ -117,8 +117,8 @@ for f in ${OUTPUT_DIR}/*.en; do ${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l en -threads 8 < $f > ${f%.*}.tok.en done -# Clean all corpora -for f in ${OUTPUT_DIR}/*.en; do +# Clean training corpus +for f in ${OUTPUT_DIR}/train.en; do fbase=${f%.*} echo "Cleaning ${fbase}..." ${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl $fbase de en "${fbase}.clean" 1 80 From adcf16a53e5d6211e89e42987374e7042536789b Mon Sep 17 00:00:00 2001 From: Joost Bastings Date: Sat, 24 Jun 2017 16:00:10 +0200 Subject: [PATCH 2/2] Clean train.tok.* not train.* --- bin/data/wmt16_en_de.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/data/wmt16_en_de.sh b/bin/data/wmt16_en_de.sh index 87c31e4f..3f78c8c2 100755 --- a/bin/data/wmt16_en_de.sh +++ b/bin/data/wmt16_en_de.sh @@ -118,7 +118,7 @@ for f in ${OUTPUT_DIR}/*.en; do done # Clean training corpus -for f in ${OUTPUT_DIR}/train.en; do +for f in ${OUTPUT_DIR}/train.tok.en; do fbase=${f%.*} echo "Cleaning ${fbase}..." ${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl $fbase de en "${fbase}.clean" 1 80