This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[Fix] Some minor fixes for AMLC Tutorial #1355

Merged
merged 16 commits on Sep 8, 2020
5 changes: 3 additions & 2 deletions scripts/datasets/general_nlp_benchmark/prepare_glue.py
@@ -614,8 +614,9 @@ def main(args):
if args.data_dir is None:
args.data_dir = args.benchmark
args.cache_path = os.path.join(args.cache_path, args.benchmark)
- print('Downloading {} to {}. Selected tasks = {}'.format(args.benchmark,
- args.data_dir, args.tasks))
+ print('Downloading {} to "{}". Selected tasks = {}'.format(args.benchmark,
+ args.data_dir,
+ args.tasks))
os.makedirs(args.cache_path, exist_ok=True)
os.makedirs(args.data_dir, exist_ok=True)
tasks = get_tasks(args.benchmark, args.tasks)
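For readers following the tutorial fix, a minimal Python sketch of the directory setup shown above, using hypothetical placeholder values (the benchmark and task names are not taken from the PR):

import os

# Hypothetical inputs standing in for the parsed args above.
benchmark, data_dir, cache_path, tasks = 'glue', None, 'cached_glue', ['sst']
if data_dir is None:
    data_dir = benchmark                      # default the data dir to the benchmark name
cache_path = os.path.join(cache_path, benchmark)
# The reworded message now quotes the target directory:
print('Downloading {} to "{}". Selected tasks = {}'.format(benchmark, data_dir, tasks))
os.makedirs(cache_path, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)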
26 changes: 16 additions & 10 deletions src/gluonnlp/layers.py
@@ -33,36 +33,42 @@


@use_np
- def get_layer_norm(normalization: str = 'layer_norm',
+ def get_norm_layer(normalization: str = 'layer_norm',
axis: int = -1,
epsilon: float = 1e-5,
in_channels: int = 0, **kwargs):
"""
- Get the layer normalization based on the type
+ Get the normalization layer based on the type

Parameters
----------
- normalization: str, default: 'layer_norm'
- The type of the layer normalization from ['layer_norm', 'no_norm']
+ normalization
+ The type of the layer normalization from ['layer_norm', 'no_norm', 'batch_norm']
axis
The axis to normalize the
epsilon
The epsilon of the normalization layer
in_channels
Input channel

Returns
-------
- ln
+ norm_layer
The layer normalization layer
"""
if isinstance(normalization, str):
if normalization == 'layer_norm':
- ln = nn.LayerNorm(axis=axis, epsilon=epsilon, in_channels=in_channels,
- **kwargs)
+ norm_layer = nn.LayerNorm(axis=axis, epsilon=epsilon, in_channels=in_channels,
+ **kwargs)
elif normalization == 'no_norm':
- ln = NoNorm(in_channels=in_channels, **kwargs)
+ norm_layer = NoNorm(in_channels=in_channels, **kwargs)
+ elif normalization == 'identity':
+ norm_layer = IdentityActivation()
+ elif normalization == 'batch_norm':
+ norm_layer = nn.BatchNorm(axis=axis, epsilon=epsilon, in_channels=in_channels, **kwargs)
else:
raise NotImplementedError('normalization={} is not supported'.format(normalization))
- return ln
+ return norm_layer
else:
raise NotImplementedError('The type of normalization must be str')

@@ -629,7 +635,7 @@ def __init__(self,
bias_initializer=bias_initializer,
dtype=dtype)
# TODO(sxjscience) We may need to set the dtype flag in LayerNorm, need to double check
- self.layer_norm = get_layer_norm(normalization=normalization,
+ self.layer_norm = get_norm_layer(normalization=normalization,
in_channels=units,
epsilon=layer_norm_eps)

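A minimal usage sketch of the renamed helper, assuming the gluonnlp.layers module as modified above (768 channels is an arbitrary example size, not a model default):

from gluonnlp.layers import get_norm_layer

# Each branch of get_norm_layer shown in the diff, exercised once.
ln = get_norm_layer('layer_norm', axis=-1, epsilon=1e-5, in_channels=768)
nonorm = get_norm_layer('no_norm', in_channels=768)
bn = get_norm_layer('batch_norm', axis=-1, epsilon=1e-5, in_channels=768)
identity = get_norm_layer('identity')   # returns the pass-through IdentityActivation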
1 change: 1 addition & 0 deletions src/gluonnlp/models/__init__.py
@@ -4,6 +4,7 @@
from . import albert
from . import bert
from . import electra
+ from . import gpt2
from . import mobilebert
from . import roberta
from . import transformer
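With this one-line addition, the GPT-2 definitions are registered next to the other backbones, so (assuming the package layout shown here) they can be reached as:

from gluonnlp.models import gpt2   # now importable alongside albert, bert, electra, ...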
15 changes: 12 additions & 3 deletions src/gluonnlp/models/albert.py
@@ -336,10 +336,12 @@ def __init__(self,
dtype=dtype)
if embed_size != units:
self.embed_factorized_proj = nn.Dense(units=units,
+ in_units=embed_size,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=embed_size)
self.embed_dropout = nn.Dropout(hidden_dropout_prob)
# Construct token type embedding
self.token_type_embed = nn.Embedding(input_dim=num_token_types,
@@ -561,15 +563,18 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.embed_size))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.embed_size,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
@@ -649,19 +654,23 @@ def __init__(self, backbone_cfg,
bias_initializer = self.backbone_model.bias_initializer
# Construct sop_classifier for sentence order prediction
self.sop_classifier = nn.Dense(units=2,
+ in_units=self.backbone_model.units,
weight_initializer=weight_initializer)
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.embed_size))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.embed_size,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
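The recurring edit in this and the following model files is to pass in_units / in_channels explicitly. A short illustrative sketch in plain Gluon (hypothetical sizes, not the model defaults): with the input shape given at construction time, parameter shapes are known immediately instead of being deferred to the first forward pass.

from mxnet.gluon import nn

dense = nn.Dense(units=2, in_units=768, flatten=False)
norm = nn.LayerNorm(epsilon=1e-12, in_channels=768)
print(dense.weight.shape)   # (2, 768) -- fully specified without running data through
print(norm.gamma.shape)     # (768,)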
7 changes: 5 additions & 2 deletions src/gluonnlp/models/bart.py
@@ -171,13 +171,16 @@ def __init__(self,
if not extract_feature:
if self.tie_weights:
self.tgt_final_layer = \
- nn.Dense(self._tgt_vocab_size, flatten=False,
+ nn.Dense(units=self._tgt_vocab_size,
+ in_units=self.dec_units,
+ flatten=False,
use_bias=False,
dtype=self._dtype)
self.tgt_final_layer.weight = self.tgt_embed_layer.weight
else:
self.tgt_final_layer = \
- nn.Dense(self._tgt_vocab_size,
+ nn.Dense(units=self._tgt_vocab_size,
+ in_units=self.dec_units,
flatten=False,
weight_initializer=self.weight_initializer,
use_bias=False,
14 changes: 11 additions & 3 deletions src/gluonnlp/models/bert.py
@@ -370,7 +370,8 @@ def __init__(self,
output_dim=units,
weight_initializer=embed_initializer,
dtype=dtype)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=units)
self.embed_dropout = nn.Dropout(hidden_dropout_prob)
# Construct token type embedding
self.token_type_embed = nn.Embedding(input_dim=num_token_types,
@@ -585,15 +586,18 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
@@ -674,19 +678,23 @@ def __init__(self, backbone_cfg,
bias_initializer = self.backbone_model.bias_initializer
# Construct nsp_classifier for next sentence prediction
self.nsp_classifier = nn.Dense(units=2,
+ in_units=self.backbone_model.units,
weight_initializer=weight_initializer)
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
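The MLM decoders above all end by assigning the word-embedding weight to the final projection (self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight). A standalone sketch of that weight-tying pattern with hypothetical sizes:

from mxnet.gluon import nn

vocab_size, units = 30522, 768   # hypothetical sizes
word_embed = nn.Embedding(input_dim=vocab_size, output_dim=units)
proj = nn.Dense(units=vocab_size, in_units=units, flatten=False)
proj.weight = word_embed.weight  # share one Parameter; only proj.bias stays separate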
34 changes: 23 additions & 11 deletions src/gluonnlp/models/electra.py
@@ -29,7 +29,7 @@
'ElectraForPretrain', 'list_pretrained_electra', 'get_pretrained_electra']

import os
- from typing import Tuple, Optional
+ from typing import Tuple, Optional, List

import mxnet as mx
import numpy as np
@@ -388,11 +388,13 @@ def __init__(self,
max_length=max_length,
dtype=self._dtype,
method=pos_embed_type)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=embed_size)

self.embed_dropout = nn.Dropout(hidden_dropout_prob)
if embed_size != units:
self.embed_factorized_proj = nn.Dense(units=units,
+ in_units=embed_size,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)
@@ -509,25 +511,30 @@ def get_initial_embedding(self, F, inputs, token_types=None):
embedding = self.embed_dropout(embedding)
return embedding

- def apply_layerwise_decay(self, layerwise_decay, not_included=None):
+ def apply_layerwise_decay(self, layerwise_decay: int,
+ not_included: Optional[List[str]] = None,
+ num_additional_layers: int = 2):
"""Apply the layer-wise gradient decay

.. math::
lr = lr * layerwise_decay^(max_depth - layer_depth)

Parameters:
----------
- layerwise_decay: int
- layer-wise decay power
- not_included: list of str
+ layerwise_decay
+ Power rate of the layer-wise decay
+ not_included
A list or parameter names that not included in the layer-wise decay
+ num_additional_layers
+ The number of layers after the current backbone. This helps determine the max depth
"""

- # consider the task specific finetuning layer as the last layer, following with pooler
- # In addition, the embedding parameters have the smaller learning rate based on this setting.
- max_depth = self.num_layers + 2
+ # Consider the task specific finetuning layer as the last layer, following with pooler
+ # In addition, the embedding parameters have the smaller learning rate based on this
+ # setting.
+ max_depth = self.num_layers + num_additional_layers
for _, value in self.collect_params('.*embed*').items():
- value.lr_mult = layerwise_decay**(max_depth)
+ value.lr_mult = layerwise_decay ** max_depth

for (layer_depth, layer) in enumerate(self.encoder.all_encoder_layers):
layer_params = layer.collect_params()
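To make the schedule concrete, a small sketch of the multipliers implied by the docstring formula above, assuming a 12-layer backbone and the default num_additional_layers=2 (the exact per-layer offset follows the loop that continues past this hunk):

layerwise_decay = 0.8
num_layers = 12
max_depth = num_layers + 2                       # num_additional_layers = 2
embed_lr_mult = layerwise_decay ** max_depth     # embeddings get the smallest multiplier
layer_lr_mults = [layerwise_decay ** (max_depth - depth) for depth in range(num_layers)]
# Later (deeper) layers keep a learning-rate multiplier closer to 1.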
@@ -630,11 +637,13 @@ def __init__(self, backbone_cfg,
self.rtd_encoder = nn.HybridSequential()
# Extra non-linear layer
self.rtd_encoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.rtd_encoder.add(get_activation(self.backbone_model.activation))
self.rtd_encoder.add(nn.Dense(units=1,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
@@ -711,17 +720,20 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(
nn.Dense(
units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight