Commit

fix bug(#64), update mask, fix typo
ShusenTang committed Nov 26, 2019
1 parent c5d0f74 commit a86f344
Showing 2 changed files with 40 additions and 62 deletions.
@@ -16,7 +16,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1.0.0 cpu\n"
"1.2.0 cpu\n"
]
}
],
@@ -52,9 +52,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# 将一个序列中所有的词记录在all_tokens中以便之后构造词典,然后在该序列后面添加PAD直到序列\n",
@@ -75,9 +73,7 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def read_data(max_seq_len):\n",
@@ -130,9 +126,7 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Encoder(nn.Module):\n",
@@ -183,9 +177,7 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_model(input_size, attention_size):\n",
@@ -198,9 +190,7 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def attention_forward(model, enc_states, dec_state):\n",
@@ -250,9 +240,7 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"class Decoder(nn.Module):\n",
@@ -261,8 +249,9 @@
" super(Decoder, self).__init__()\n",
" self.embedding = nn.Embedding(vocab_size, embed_size)\n",
" self.attention = attention_model(2*num_hiddens, attention_size)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 2*embed_size\n",
" self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)\n",
" # GRU的输入包含attention输出的c和实际输入, 所以尺寸是 num_hiddens+embed_size\n",
" self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens, \n",
" num_layers, dropout=drop_prob)\n",
" self.out = nn.Linear(num_hiddens, vocab_size)\n",
"\n",
" def forward(self, cur_input, state, enc_states):\n",
@@ -272,8 +261,8 @@
" \"\"\"\n",
" # 使用注意力机制计算背景向量\n",
" c = attention_forward(self.attention, enc_states, state[-1])\n",
" # 将嵌入后的输入和背景向量在特征维连结\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (批量大小, 2*embed_size)\n",
" # 将嵌入后的输入和背景向量在特征维连结, (批量大小, num_hiddens+embed_size)\n",
" input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) \n",
" # 为输入和背景向量的连结增加时间步维,时间步个数为1\n",
" output, state = self.rnn(input_and_c.unsqueeze(0), state)\n",
" # 移除时间步维,输出形状为(批量大小, 输出词典大小)\n",
@@ -295,9 +284,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def batch_loss(encoder, decoder, X, Y, loss):\n",
@@ -308,25 +295,23 @@
" dec_state = decoder.begin_state(enc_state)\n",
" # 解码器在最初时间步的输入是BOS\n",
" dec_input = torch.tensor([out_vocab.stoi[BOS]] * batch_size)\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失\n",
" # 我们将使用掩码变量mask来忽略掉标签为填充项PAD的损失, 初始全1\n",
" mask, num_not_pad_tokens = torch.ones(batch_size,), 0\n",
" l = torch.tensor([0.0])\n",
" for y in Y.permute(1,0): # Y shape: (batch, seq_len)\n",
" dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)\n",
" l = l + (mask * loss(dec_output, y)).sum()\n",
" dec_input = y # 使用强制教学\n",
" num_not_pad_tokens += mask.sum().item()\n",
" # 将PAD对应位置的掩码设成0, 原文这里是 y != out_vocab.stoi[EOS], 感觉有误\n",
" mask = mask * (y != out_vocab.stoi[PAD]).float()\n",
" # EOS后面全是PAD. 下面一行保证一旦遇到EOS接下来的循环中mask就一直是0\n",
" mask = mask * (y != out_vocab.stoi[EOS]).float()\n",
" return l / num_not_pad_tokens"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def train(encoder, decoder, dataset, lr, batch_size, num_epochs):\n",
@@ -358,11 +343,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"epoch 10, loss 0.441\n",
"epoch 20, loss 0.183\n",
"epoch 30, loss 0.100\n",
"epoch 40, loss 0.046\n",
"epoch 50, loss 0.025\n"
"epoch 10, loss 0.475\n",
"epoch 20, loss 0.245\n",
"epoch 30, loss 0.157\n",
"epoch 40, loss 0.052\n",
"epoch 50, loss 0.039\n"
]
}
],
@@ -386,9 +371,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def translate(encoder, decoder, input_seq, max_seq_len):\n",
@@ -443,9 +426,7 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def bleu(pred_tokens, label_tokens, k):\n",
@@ -466,9 +447,7 @@
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def score(input_seq, label_seq, k):\n",
@@ -504,29 +483,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
"bleu 0.658, predict: they are russian .\n"
"bleu 0.658, predict: they are exhausted .\n"
]
}
],
"source": [
"score('ils sont canadiens .', 'they are canadian .', k=2)"
"score('ils sont canadienne .', 'they are canadian .', k=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:anaconda3]",
"display_name": "Python [conda env:py36]",
"language": "python",
"name": "conda-env-anaconda3-py"
"name": "conda-env-py36-py"
},
"language_info": {
"codemirror_mode": {
@@ -538,7 +515,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.6.2"
}
},
"nbformat": 4,
@@ -165,8 +165,9 @@ class Decoder(nn.Module):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = attention_model(2*num_hiddens, attention_size)
-        # The GRU input contains the attention output c and the actual input, so its size is 2*embed_size
-        self.rnn = nn.GRU(2*embed_size, num_hiddens, num_layers, dropout=drop_prob)
+        # The GRU input contains the attention output c and the actual input, so its size is num_hiddens+embed_size
+        self.rnn = nn.GRU(num_hiddens + embed_size, num_hiddens,
+                          num_layers, dropout=drop_prob)
        self.out = nn.Linear(num_hiddens, vocab_size)

    def forward(self, cur_input, state, enc_states):
@@ -176,8 +177,8 @@ class Decoder(nn.Module):
"""
# 使用注意力机制计算背景向量
c = attention_forward(self.attention, enc_states, state[-1])
-        # Concatenate the embedded input and the context vector along the feature dimension
-        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1) # (batch_size, 2*embed_size)
+        # Concatenate the embedded input and the context vector along the feature dimension, (batch_size, num_hiddens+embed_size)
+        input_and_c = torch.cat((self.embedding(cur_input), c), dim=1)
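        # Illustrative shape walkthrough (an added note, assuming batch size b):
        #   self.embedding(cur_input): (b, embed_size)
        #   c:                         (b, num_hiddens)
        #   input_and_c:               (b, num_hiddens + embed_size)
        # which is why the GRU above takes inputs of size num_hiddens + embed_size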
        # Add a time-step dimension, with 1 time step, to the concatenation of input and context vector
        output, state = self.rnn(input_and_c.unsqueeze(0), state)
        # Remove the time-step dimension; the output shape is (batch_size, output vocab size)
@@ -210,8 +211,8 @@ def batch_loss(encoder, decoder, X, Y, loss):
        l = l + (mask * loss(dec_output, y)).sum()
        dec_input = y  # teacher forcing
        num_not_pad_tokens += mask.sum().item()
-        # Set the mask to 0 at the positions corresponding to PAD; the original text uses y != out_vocab.stoi[EOS] here, which seems wrong
-        mask = mask * (y != out_vocab.stoi[PAD]).float()
+        # Everything after EOS is PAD. The line below ensures that once EOS appears, mask stays 0 in all remaining iterations
+        mask = mask * (y != out_vocab.stoi[EOS]).float()
    return l / num_not_pad_tokens
```
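
To see what the new EOS-based update does, here is a minimal standalone sketch (illustrative only: the token ids and the toy label batch are invented for this example) that traces how mask evolves across the time steps of batch_loss:

``` python
import torch

PAD_ID, EOS_ID = 0, 2  # hypothetical token ids, for illustration only

# A toy label batch transposed to (seq_len, batch), as in batch_loss:
# sequence 0 hits EOS at step 2, sequence 1 at step 3; PAD follows EOS.
Y = torch.tensor([[5, 6],
                  [2, 7],
                  [0, 2],
                  [0, 0]])

mask = torch.ones(2)
for y in Y:
    # the loss at this step is weighted by the current mask, so the EOS
    # step itself still counts; only the steps after it are zeroed out
    print(mask.tolist())
    mask = mask * (y != EOS_ID).float()
# prints [1.0, 1.0], [1.0, 1.0], [0.0, 1.0], [0.0, 0.0]
```

Because the mask is applied to the loss before being updated, the EOS position itself is still counted, while every position after it (all PAD) is ignored, which is exactly the behavior the updated comment describes.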

@@ -299,7 +300,7 @@ translate(encoder, decoder, input_seq, max_seq_len)

Machine translation results are usually evaluated with BLEU (Bilingual Evaluation Understudy) [1]. For any subsequence of the model's predicted sequence, BLEU checks whether the subsequence appears in the label sequence.

-Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences of the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5,\ p_2 = 3/4,\ p_3 = 1/3,\ p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as
+Specifically, let the precision of $n$-word subsequences be $p_n$: the ratio of the number of $n$-word subsequences of the predicted sequence that match the label sequence to the number of $n$-word subsequences in the predicted sequence. For example, if the label sequence is $A$, $B$, $C$, $D$, $E$, $F$ and the predicted sequence is $A$, $B$, $B$, $C$, $D$, then $p_1 = 4/5, p_2 = 3/4, p_3 = 1/3, p_4 = 0$. Let $len_{\text{label}}$ and $len_{\text{pred}}$ be the numbers of words in the label sequence and the predicted sequence, respectively. Then BLEU is defined as

$$ \exp\left(\min\left(0, 1 - \frac{len_{\text{label}}}{len_{\text{pred}}}\right)\right) \prod_{n=1}^k p_n^{1/2^n},$$
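
As a quick sanity check on this definition, the sketch below follows the formula directly (the name bleu_score and its details are this sketch's own, not necessarily identical to the notebook's bleu function) and reproduces the worked example with $k = 2$:

``` python
import collections
import math

def bleu_score(pred_tokens, label_tokens, k):
    """BLEU as defined above: brevity penalty times clipped n-gram precisions."""
    len_pred, len_label = len(pred_tokens), len(label_tokens)
    score = math.exp(min(0, 1 - len_label / len_pred))  # brevity penalty
    for n in range(1, k + 1):
        # count each label n-gram so matches are clipped by label frequency
        label_subs = collections.Counter(
            tuple(label_tokens[i:i + n]) for i in range(len_label - n + 1))
        num_matches = 0
        for i in range(len_pred - n + 1):
            ngram = tuple(pred_tokens[i:i + n])
            if label_subs[ngram] > 0:
                num_matches += 1
                label_subs[ngram] -= 1
        score *= (num_matches / (len_pred - n + 1)) ** (0.5 ** n)  # p_n^(1/2^n)
    return score

# label A B C D E F, prediction A B B C D: p_1 = 4/5, p_2 = 3/4
print(bleu_score(list('ABBCD'), list('ABCDEF'), k=2))
# exp(1 - 6/5) * (4/5)**(1/2) * (3/4)**(1/4) ≈ 0.6815
```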

@@ -348,7 +349,7 @@ bleu 1.000, predict: they are watching .
Test an example that is not in the training set.

``` python
-score('ils sont canadiens .', 'they are canadian .', k=2)
+score('ils sont canadienne .', 'they are canadian .', k=2)
```
Output:
```
bleu 0.658, predict: they are exhausted .
```
