[WIP v2 - deprecated] Unlikelihood token loss #2011

Draft · wants to merge 4 commits into base: master
32 changes: 32 additions & 0 deletions .github/workflows/push.yml
@@ -148,6 +148,38 @@ jobs:
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10 \
-copy_attn
- name: Test LM training with label smoothing
run: |
python train.py \
-config data/lm_data.yaml \
-src_vocab /tmp/onmt.vocab.src \
-tgt_vocab /tmp/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-label_smoothing 0.1 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10
- name: Test LM training with unlikelihood loss
run: |
python train.py \
-config data/lm_data.yaml \
-src_vocab /tmp/onmt.vocab.src \
-tgt_vocab /tmp/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-unlikelihood_coeff 1.0 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10
- name: Test Graph neural network training
run: |
python train.py \
7 changes: 4 additions & 3 deletions onmt/modules/copy_generator.py
@@ -2,7 +2,7 @@
import torch.nn as nn

from onmt.utils.misc import aeq
from onmt.utils.loss import CommonLossCompute
from onmt.utils.loss import LossComputeBase


def collapse_copy_scores(scores, batch, tgt_vocab, src_vocabs=None,
@@ -177,7 +177,7 @@ def forward(self, scores, align, target):
return loss


class CommonCopyGeneratorLossCompute(CommonLossCompute):
class CommonCopyGeneratorLossCompute(LossComputeBase):
Member:
I'm not sure I grasp the whole rationale behind the CommonLossCompute/LossComputeBase refactoring. Is the last big remaining difference only the log_ppl computation?

Member:
(Underlying question: do we really need both CommonLossCompute and LossComputeBase anymore?)

Collaborator Author (@funboarder13920), Feb 19, 2021:
The _compute_loss, _make_shard_state and the way the generator is used differ between CopyGeneratorLoss and the other classes.

Collaborator Author (@funboarder13920):
We can do it in one class; the code is already not very clear, so it's not going to get worse. If we do that, CopyGenerator will override _compute_loss, and _compute_log_ppl and _compute_alignement_loss will only be used in the compute_loss of the main class.

Member (replying to the above):
Yes, I think this might be a bit better to explicitly override this method instead of having a full class that we don't really know what it's for unless we look at this specific CopyGeneratorLoss.

Collaborator Author (@funboarder13920):
I merged it; the ppl part is not nice. Also, there is a normalization argument that was not used anywhere; I will investigate whether the normalization step disappeared by mistake.

Collaborator Author (@funboarder13920), Feb 19, 2021:
normalization was already unused a year ago:

def __init__(self, criterion, generator, normalization="sents",

"""Common Copy Generator Loss Computation."""
def __init__(self, criterion, generator, tgt_vocab, normalize_by_length,
lambda_coverage=0.0, tgt_shift_index=1):
@@ -231,7 +231,8 @@ def _compute_loss(self, batch, output, target, copy_attn, align,
target_data[correct_mask] += offset_align

# Compute sum of perplexities for stats
stats = self._stats(loss.sum().clone(), scores_data, target_data)
stats = self._stats(loss.sum().clone(), loss.sum().clone(),
scores_data, target_data)

# this part looks like it belongs in CopyGeneratorLoss
if self.normalize_by_length:
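To make the refactoring discussed in the thread above concrete, here is an illustrative-only sketch of the structure it converges on: a single base loss class whose _compute_loss hook is overridden by the copy-generator variant. Class and method names follow the discussion, but the signatures and bodies are simplified assumptions, not OpenNMT-py's actual API.

class LossComputeBase:
    """Shared loss plumbing; subclasses override _compute_loss (sketch)."""

    def __init__(self, criterion, generator):
        self.criterion = criterion
        self.generator = generator

    def __call__(self, batch, output, target, **kwargs):
        # sharding, normalization and statistics would be handled here once
        return self._compute_loss(batch, output, target, **kwargs)

    def _compute_loss(self, batch, output, target, **kwargs):
        scores = self.generator(output)
        return self.criterion(scores, target)


class CommonCopyGeneratorLossCompute(LossComputeBase):
    """Copy-generator variant: only the scoring step differs (sketch)."""

    def _compute_loss(self, batch, output, target, copy_attn=None, align=None,
                      **kwargs):
        # copy-specific scoring: the generator also consumes the copy attention
        # and the batch's source map (simplified here)
        scores = self.generator(output, copy_attn, batch.src_map)
        return self.criterion(scores, align, target)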
6 changes: 6 additions & 0 deletions onmt/modules/sparse_losses.py
@@ -77,3 +77,9 @@ def forward(self, input, target):
elif self.reduction == 'elementwise_mean':
loss = loss.sum() / size
return loss


class ExpandedSparsemaxLoss(SparsemaxLoss):
def forward(self, input, target):
gtruth = target.view(-1)
return super(ExpandedSparsemaxLoss, self).forward(input, gtruth)
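A hypothetical usage sketch of the new ExpandedSparsemaxLoss: it only flattens the gold tensor, so a (seq_len, batch) target can be passed alongside already-bottled (seq_len * batch, vocab) scores. Constructor arguments are simply inherited from SparsemaxLoss; the shapes below are illustrative.

import torch
from onmt.modules.sparse_losses import ExpandedSparsemaxLoss

scores = torch.randn(5 * 2, 8)        # (seq_len * batch, vocab) decoder scores
target = torch.randint(0, 8, (5, 2))  # (seq_len, batch) gold indices

criterion = ExpandedSparsemaxLoss()   # same arguments as SparsemaxLoss
loss = criterion(scores, target)      # target is viewed as (seq_len * batch,)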
24 changes: 17 additions & 7 deletions onmt/opts.py
@@ -574,13 +574,23 @@ def _add_train_general_opts(parser):
'suggested a value of 0.98 for beta2, this parameter may '
'not work well for normal models / default '
'baselines.')
group.add('--label_smoothing', '-label_smoothing', type=float, default=0.0,
help="Label smoothing value epsilon. "
"Probabilities of all non-true labels "
"will be smoothed by epsilon / (vocab_size - 1). "
"Set to zero to turn off label smoothing. "
"For more detailed information, see: "
"https://arxiv.org/abs/1512.00567")
subgroup = group.add_mutually_exclusive_group()
subgroup.add('--label_smoothing', '-label_smoothing', type=float,
default=0.0,
help="Label smoothing value epsilon. "
"Probabilities of all non-true labels "
"will be smoothed by epsilon / (vocab_size - 1). "
"Set to zero to turn off label smoothing. "
"For more detailed information, see: "
"https://arxiv.org/abs/1512.00567")
subgroup.add('--unlikelihood_coeff', '-unlikelihood_coeff', type=float,
default=0.0,
help="Loss coefficient for token unlikelihood loss. "
"Usually set to 1. max_generator_batches option will "
"limit the neighbourhood size of the unlikelihood loss."
" For more detailed information, see: "
"https://arxiv.org/abs/1908.04319 and "
"https://openreview.net/forum?id=SJeYe0NtvH")
group.add('--average_decay', '-average_decay', type=float, default=0,
help="Moving average decay. "
"Set to other than 0 (e.g. 1e-4) to activate. "
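For reference, the unlikelihood objective behind -unlikelihood_coeff adds, at every decoding step, a penalty -log(1 - p(c)) for each candidate token c that already appeared earlier in the target, excluding the current gold token and padding (Welleck et al., 2019, https://arxiv.org/abs/1908.04319). The following is a minimal, self-contained sketch of that computation; the function name, tensor shapes and clamping epsilon are assumptions for illustration, not this PR's UnlikelihoodTokenLoss implementation.

import torch
import torch.nn.functional as F


def unlikelihood_token_loss(logits, target, padding_idx, coeff=1.0):
    """logits: (T, B, V) raw decoder scores; target: (T, B) gold token indices."""
    T, B, V = logits.shape
    probs = F.softmax(logits, dim=-1)

    # prev[t, s, b] = target[s, b] for s < t, else padding_idx
    prev = target.unsqueeze(0).expand(T, T, B).clone()
    future = torch.triu(torch.ones(T, T, dtype=torch.bool, device=target.device))
    prev.masked_fill_(future.unsqueeze(-1), padding_idx)
    # never penalize the current gold token, and skip padded target steps
    prev.masked_fill_(prev.eq(target.unsqueeze(1)), padding_idx)
    prev.masked_fill_(target.eq(padding_idx).unsqueeze(1), padding_idx)

    # one-hot union of negative candidates over the vocabulary
    neg = torch.zeros(T, B, V, device=logits.device)
    neg.scatter_(-1, prev.permute(0, 2, 1), 1.0)
    neg[..., padding_idx] = 0.0

    # sum of -log(1 - p(c)) over candidate tokens c
    return coeff * -(torch.log((1.0 - probs).clamp_min(1e-6)) * neg).sum()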
36 changes: 36 additions & 0 deletions onmt/tests/pull_request_chk.sh
@@ -179,6 +179,42 @@ ${PYTHON} onmt/bin/train.py \
-rnn_size 16 -train_steps 10 \
-copy_attn >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}

echo -n " [+] Testing LM training with label smoothing..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-label_smoothing 0.1 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}

echo -n " [+] Testing LM training with unlikelihood loss..."
${PYTHON} onmt/bin/train.py \
-config ${DATA_DIR}/lm_data.yaml \
-src_vocab $TMP_OUT_DIR/onmt.vocab.src \
-tgt_vocab $TMP_OUT_DIR/onmt.vocab.src \
-model_task lm \
-encoder_type transformer_lm \
-decoder_type transformer_lm \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-unlikelihood_coeff 1 \
-dec_layers 2 -batch_size 10 \
-heads 4 -transformer_ff 64 \
-word_vec_size 16 -report_every 5 \
-rnn_size 16 -train_steps 10 >> ${LOG_FILE} 2>&1
[ "$?" -eq 0 ] || error_exit
echo "Succeeded" | tee -a ${LOG_FILE}*
rm $TMP_OUT_DIR/onmt.vocab*

78 changes: 78 additions & 0 deletions onmt/tests/test_unlikelihood_loss_criterion.py
@@ -0,0 +1,78 @@
import unittest
from onmt.utils.loss import UnlikelihoodTokenLoss
import torch
import math


class TestUnlikelihoodLossCriterion(unittest.TestCase):
def test_compute_previous_context_tokens(self):
criterion = UnlikelihoodTokenLoss(1, 7)
target = torch.tensor([[2, 3, 4, 3, 5], [1, 1, 5, 6, 7]]).permute(1, 0)
previous_context_tokens = criterion.compute_previous_context_tokens(
target
)

self.assertEqual(
previous_context_tokens.permute(1, 0, 2).tolist(),
torch.tensor(
[
[
[7, 7, 7, 7, 7],
[2, 7, 7, 7, 7],
[2, 3, 7, 7, 7],
[2, 7, 4, 7, 7],
[2, 3, 4, 3, 7],
],
[
[7, 7, 7, 7, 7],
[7, 7, 7, 7, 7],
[1, 1, 7, 7, 7],
[1, 1, 5, 7, 7],
[7, 7, 7, 7, 7],
],
]
).tolist(),
)

def test_loss_perfect_pred_should_be_zero(self):
criterion = UnlikelihoodTokenLoss(1, 7)
n_prob = -10e6
target = torch.tensor([[2, 3, 4, 3, 5], [1, 1, 5, 6, 7]]).permute(1, 0)
perfect_probs = [
[[n_prob if i != t else 1 for i in range(8)] for t in ex_target]
for ex_target in target
]

# check padded seq is removed
perfect_probs[-1][-1][-1] = n_prob
perfect_probs[-1][-1][1] = 0.1

output = torch.tensor(perfect_probs).view(-1, 8)

unlikelihood_loss = criterion.compute_unlikelihood_loss(output, target)

self.assertEqual(unlikelihood_loss.sum().item(), 0)

def test_loss_value(self):
criterion = UnlikelihoodTokenLoss(1, 7)
n_prob = -10e6
target = torch.tensor([[2, 3, 4, 3, 5], [1, 1, 5, 6, 7]]).permute(1, 0)
perfect_probs = [
[[n_prob if i != t else 1 for i in range(8)] for t in ex_target]
for ex_target in target
]

# check padded seq is removed
perfect_probs[-1][-1][-1] = n_prob
perfect_probs[-1][-1][1] = 0.1

# set prob at 0.5 on 1 after softmax
perfect_probs[2][-1][1] = 1

output = torch.tensor(perfect_probs).view(-1, 8)

unlikelihood_loss = criterion.compute_unlikelihood_loss(output, target)

self.assertAlmostEqual(
unlikelihood_loss.view(5, 2, 8)[2, -1, 1].item(), -math.log(0.5)
)
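These tests pin down the intended behaviour: compute_previous_context_tokens returns, for every target position, the preceding tokens with the current gold token and padded steps masked to the padding index, and the loss is zero for a near-perfect prediction and -log(0.5) when a context token receives probability 0.5. They should be runnable on their own with the standard unittest runner, e.g.:

python -m unittest onmt.tests.test_unlikelihood_loss_criterion -v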