onnx bi-transformer (facebookresearch#385)
Summary:
Pull Request resolved: facebookresearch#385

Pull Request resolved: facebookresearch/pytext#6

Pull Request resolved: pytorch/pytorch#14292

Reviewed By: jingfeidu

Differential Revision: D10517864

fbshipit-source-id: 81008b5cc6aab70e23329c187392fb72ee057d78
Haoran Li authored and facebook-github-bot committed Nov 27, 2018
1 parent 14506a8 commit a5e2d78
Showing 2 changed files with 19 additions and 7 deletions.
2 changes: 1 addition & 1 deletion fairseq/modules/highway.py
@@ -51,5 +51,5 @@ def forward(
             proj_x, gate = projection.chunk(2, dim=-1)
             proj_x = self.activation(proj_x)
             gate = F.sigmoid(gate)
-            x = gate * x + (1 - gate) * proj_x
+            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
         return x
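
The highway change replaces the Python scalar 1 with a tensor constant built from gate itself, so the (1 - gate) term stays a pure tensor operation with gate's dtype and device when the module is traced for ONNX export. A minimal standalone sketch of the gating computation (the function name and shapes below are illustrative, not taken from fairseq):

import torch
import torch.nn.functional as F

def highway_gate(x, projection, activation=F.relu):
    # Split the layer projection into the transform half and the gate half.
    proj_x, gate = projection.chunk(2, dim=-1)
    proj_x = activation(proj_x)
    gate = torch.sigmoid(gate)
    # gate.new_tensor([1]) builds the constant as a tensor matching gate's
    # dtype/device, rather than mixing a Python scalar into the traced graph.
    one = gate.new_tensor([1])
    return gate * x + (one - gate) * proj_x

x = torch.randn(2, 8)
projection = torch.randn(2, 16)   # stand-in for layer(x)
print(highway_gate(x, projection).shape)  # torch.Size([2, 8])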
24 changes: 18 additions & 6 deletions fairseq/modules/multihead_attention.py
@@ -156,20 +156,32 @@ def forward(self, query, key, value, key_padding_mask=None, incremental_state=None
             if attn_mask is not None:
                 attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
             if key_padding_mask is not None:
-                key_padding_mask = torch.cat([key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1)], dim=1)
+                key_padding_mask = torch.cat(
+                    [key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask)], dim=1)
 
         attn_weights = torch.bmm(q, k.transpose(1, 2))
         assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
 
         if attn_mask is not None:
-            attn_weights += attn_mask.unsqueeze(0)
+            attn_mask = attn_mask.unsqueeze(0)
+            if self.onnx_trace:
+                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+            attn_weights += attn_mask
 
         if key_padding_mask is not None:
             # don't attend to padding symbols
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.float().masked_fill(
-                key_padding_mask.unsqueeze(1).unsqueeze(2),
-                float('-inf'),
-            ).type_as(attn_weights)  # FP16 support: cast to float and back
+            if self.onnx_trace:
+                attn_weights = torch.where(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2),
+                    torch.Tensor([float("-Inf")]),
+                    attn_weights.float()
+                ).type_as(attn_weights)
+            else:
+                attn_weights = attn_weights.float().masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2),
+                    float('-inf'),
+                ).type_as(attn_weights)  # FP16 support: cast to float and back
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
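
The multihead-attention edits all point the same way, keeping the forward pass traceable for ONNX export: the appended padding column is built with torch.zeros(...).type_as(...) instead of new_zeros, the additive attn_mask is repeated to the full (bsz * num_heads, tgt_len, src_len) shape under self.onnx_trace instead of relying on broadcasting, and the key padding mask is applied with torch.where plus an explicit -Inf tensor rather than masked_fill with a Python float. A small self-contained sketch of the two masking paths (function name, shapes, and the boolean mask below are illustrative, not copied from fairseq):

import torch

def mask_padding(attn_weights, key_padding_mask, onnx_trace=False):
    # attn_weights: (bsz, num_heads, tgt_len, src_len)
    # key_padding_mask: (bsz, src_len), True where the source position is padding
    mask = key_padding_mask.unsqueeze(1).unsqueeze(2)  # (bsz, 1, 1, src_len)
    if onnx_trace:
        # torch.where with an explicit -Inf tensor traces into the graph,
        # avoiding masked_fill with a Python float constant.
        return torch.where(
            mask,
            torch.tensor([float("-Inf")]),
            attn_weights.float(),
        ).type_as(attn_weights)
    # Eager/FP16 path: cast to float, fill, cast back.
    return attn_weights.float().masked_fill(mask, float("-inf")).type_as(attn_weights)

bsz, num_heads, tgt_len, src_len = 2, 4, 3, 5
attn_weights = torch.randn(bsz, num_heads, tgt_len, src_len)
key_padding_mask = torch.zeros(bsz, src_len, dtype=torch.bool)
key_padding_mask[:, -1] = True   # last source position is padding
out = mask_padding(attn_weights, key_padding_mask, onnx_trace=True)
print(torch.isinf(out[..., -1]).all())  # tensor(True)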
