This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[Fix] Some minor fixes for AMLC Tutorial #1355

Merged
merged 16 commits on Sep 8, 2020
5 changes: 3 additions & 2 deletions scripts/datasets/general_nlp_benchmark/prepare_glue.py
@@ -614,8 +614,9 @@ def main(args):
if args.data_dir is None:
args.data_dir = args.benchmark
args.cache_path = os.path.join(args.cache_path, args.benchmark)
- print('Downloading {} to {}. Selected tasks = {}'.format(args.benchmark,
- args.data_dir, args.tasks))
+ print('Downloading {} to "{}". Selected tasks = {}'.format(args.benchmark,
+ args.data_dir,
+ args.tasks))
os.makedirs(args.cache_path, exist_ok=True)
os.makedirs(args.data_dir, exist_ok=True)
tasks = get_tasks(args.benchmark, args.tasks)
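For readers following the tutorial fix, a minimal Python sketch of the directory setup shown above, using hypothetical placeholder values (the benchmark and task names are not taken from the PR):

import os

# Hypothetical inputs standing in for the parsed args above.
benchmark, data_dir, cache_path, tasks = 'glue', None, 'cached_glue', ['sst']
if data_dir is None:
    data_dir = benchmark                      # default the data dir to the benchmark name
cache_path = os.path.join(cache_path, benchmark)
# The reworded message now quotes the target directory:
print('Downloading {} to "{}". Selected tasks = {}'.format(benchmark, data_dir, tasks))
os.makedirs(cache_path, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)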
26 changes: 16 additions & 10 deletions src/gluonnlp/layers.py
@@ -33,36 +33,42 @@


@use_np
- def get_layer_norm(normalization: str = 'layer_norm',
+ def get_norm_layer(normalization: str = 'layer_norm',
axis: int = -1,
epsilon: float = 1e-5,
in_channels: int = 0, **kwargs):
"""
- Get the layer normalization based on the type
+ Get the normalization layer based on the type

Parameters
----------
- normalization: str, default: 'layer_norm'
- The type of the layer normalization from ['layer_norm', 'no_norm']
+ normalization
+ The type of the layer normalization from ['layer_norm', 'no_norm', 'batch_norm']
axis
The axis to normalize the
epsilon
The epsilon of the normalization layer
in_channels
Input channel

Returns
-------
- ln
+ norm_layer
The layer normalization layer
"""
if isinstance(normalization, str):
if normalization == 'layer_norm':
- ln = nn.LayerNorm(axis=axis, epsilon=epsilon, in_channels=in_channels,
- **kwargs)
+ norm_layer = nn.LayerNorm(axis=axis, epsilon=epsilon, in_channels=in_channels,
+ **kwargs)
elif normalization == 'no_norm':
- ln = NoNorm(in_channels=in_channels, **kwargs)
+ norm_layer = NoNorm(in_channels=in_channels, **kwargs)
+ elif normalization == 'identity':
+ norm_layer = IdentityActivation()
+ elif normalization == 'batch_norm':
+ norm_layer = nn.BatchNorm(axis=axis, epsilon=epsilon, in_channels=in_channels, **kwargs)
else:
raise NotImplementedError('normalization={} is not supported'.format(normalization))
- return ln
+ return norm_layer
else:
raise NotImplementedError('The type of normalization must be str')

@@ -629,7 +635,7 @@ def __init__(self,
bias_initializer=bias_initializer,
dtype=dtype)
# TODO(sxjscience) We may need to set the dtype flag in LayerNorm, need to double check
- self.layer_norm = get_layer_norm(normalization=normalization,
+ self.layer_norm = get_norm_layer(normalization=normalization,
in_channels=units,
epsilon=layer_norm_eps)

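A minimal usage sketch of the renamed helper, assuming the gluonnlp.layers module as modified above (768 channels is an arbitrary example size, not a model default):

from gluonnlp.layers import get_norm_layer

# Each branch of get_norm_layer shown in the diff, exercised once.
ln = get_norm_layer('layer_norm', axis=-1, epsilon=1e-5, in_channels=768)
nonorm = get_norm_layer('no_norm', in_channels=768)
bn = get_norm_layer('batch_norm', axis=-1, epsilon=1e-5, in_channels=768)
identity = get_norm_layer('identity')   # returns the pass-through IdentityActivation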
1 change: 1 addition & 0 deletions src/gluonnlp/models/__init__.py
@@ -4,6 +4,7 @@
from . import albert
from . import bert
from . import electra
+ from . import gpt2
from . import mobilebert
from . import roberta
from . import transformer
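With this one-line addition, the GPT-2 definitions are registered next to the other backbones, so (assuming the package layout shown here) they can be reached as:

from gluonnlp.models import gpt2   # now importable alongside albert, bert, electra, ...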
15 changes: 12 additions & 3 deletions src/gluonnlp/models/albert.py
@@ -336,10 +336,12 @@ def __init__(self,
dtype=dtype)
if embed_size != units:
self.embed_factorized_proj = nn.Dense(units=units,
+ in_units=embed_size,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=embed_size)
self.embed_dropout = nn.Dropout(hidden_dropout_prob)
# Construct token type embedding
self.token_type_embed = nn.Embedding(input_dim=num_token_types,
@@ -561,15 +563,18 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.embed_size))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.embed_size,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
@@ -649,19 +654,23 @@ def __init__(self, backbone_cfg,
bias_initializer = self.backbone_model.bias_initializer
# Construct sop_classifier for sentence order prediction
self.sop_classifier = nn.Dense(units=2,
+ in_units=self.backbone_model.units,
weight_initializer=weight_initializer)
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.embed_size))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.embed_size,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
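The recurring edit in this and the following model files is to pass in_units / in_channels explicitly. A short illustrative sketch in plain Gluon (hypothetical sizes, not the model defaults): with the input shape given at construction time, parameter shapes are known immediately instead of being deferred to the first forward pass.

from mxnet.gluon import nn

dense = nn.Dense(units=2, in_units=768, flatten=False)
norm = nn.LayerNorm(epsilon=1e-12, in_channels=768)
print(dense.weight.shape)   # (2, 768) -- fully specified without running data through
print(norm.gamma.shape)     # (768,)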
7 changes: 5 additions & 2 deletions src/gluonnlp/models/bart.py
@@ -171,13 +171,16 @@ def __init__(self,
if not extract_feature:
if self.tie_weights:
self.tgt_final_layer = \
- nn.Dense(self._tgt_vocab_size, flatten=False,
+ nn.Dense(units=self._tgt_vocab_size,
+ in_units=self.dec_units,
+ flatten=False,
use_bias=False,
dtype=self._dtype)
self.tgt_final_layer.weight = self.tgt_embed_layer.weight
else:
self.tgt_final_layer = \
- nn.Dense(self._tgt_vocab_size,
+ nn.Dense(units=self._tgt_vocab_size,
+ in_units=self.dec_units,
flatten=False,
weight_initializer=self.weight_initializer,
use_bias=False,
14 changes: 11 additions & 3 deletions src/gluonnlp/models/bert.py
@@ -370,7 +370,8 @@ def __init__(self,
output_dim=units,
weight_initializer=embed_initializer,
dtype=dtype)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=units)
self.embed_dropout = nn.Dropout(hidden_dropout_prob)
# Construct token type embedding
self.token_type_embed = nn.Embedding(input_dim=num_token_types,
@@ -585,15 +586,18 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
@@ -674,19 +678,23 @@ def __init__(self, backbone_cfg,
bias_initializer = self.backbone_model.bias_initializer
# Construct nsp_classifier for next sentence prediction
self.nsp_classifier = nn.Dense(units=2,
+ in_units=self.backbone_model.units,
weight_initializer=weight_initializer)
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight
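The MLM decoders above all end by assigning the word-embedding weight to the final projection (self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight). A standalone sketch of that weight-tying pattern with hypothetical sizes:

from mxnet.gluon import nn

vocab_size, units = 30522, 768   # hypothetical sizes
word_embed = nn.Embedding(input_dim=vocab_size, output_dim=units)
proj = nn.Dense(units=vocab_size, in_units=units, flatten=False)
proj.weight = word_embed.weight  # share one Parameter; only proj.bias stays separate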
34 changes: 23 additions & 11 deletions src/gluonnlp/models/electra.py
@@ -29,7 +29,7 @@
'ElectraForPretrain', 'list_pretrained_electra', 'get_pretrained_electra']

import os
- from typing import Tuple, Optional
+ from typing import Tuple, Optional, List

import mxnet as mx
import numpy as np
@@ -388,11 +388,13 @@ def __init__(self,
max_length=max_length,
dtype=self._dtype,
method=pos_embed_type)
- self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps)
+ self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps,
+ in_channels=embed_size)

self.embed_dropout = nn.Dropout(hidden_dropout_prob)
if embed_size != units:
self.embed_factorized_proj = nn.Dense(units=units,
+ in_units=embed_size,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)
@@ -509,25 +511,30 @@ def get_initial_embedding(self, F, inputs, token_types=None):
embedding = self.embed_dropout(embedding)
return embedding

- def apply_layerwise_decay(self, layerwise_decay, not_included=None):
+ def apply_layerwise_decay(self, layerwise_decay: int,
+ not_included: Optional[List[str]] = None,
+ num_additional_layers: int = 2):
"""Apply the layer-wise gradient decay

.. math::
lr = lr * layerwise_decay^(max_depth - layer_depth)

Parameters:
----------
- layerwise_decay: int
- layer-wise decay power
- not_included: list of str
+ layerwise_decay
+ Power rate of the layer-wise decay
+ not_included
A list or parameter names that not included in the layer-wise decay
+ num_additional_layers
+ The number of layers after the current backbone. This helps determine the max depth
"""

- # consider the task specific finetuning layer as the last layer, following with pooler
- # In addition, the embedding parameters have the smaller learning rate based on this setting.
- max_depth = self.num_layers + 2
+ # Consider the task specific finetuning layer as the last layer, following with pooler
+ # In addition, the embedding parameters have the smaller learning rate based on this
+ # setting.
+ max_depth = self.num_layers + num_additional_layers
for _, value in self.collect_params('.*embed*').items():
- value.lr_mult = layerwise_decay**(max_depth)
+ value.lr_mult = layerwise_decay ** max_depth

for (layer_depth, layer) in enumerate(self.encoder.all_encoder_layers):
layer_params = layer.collect_params()
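To make the schedule concrete, a small sketch of the multipliers implied by the docstring formula above, assuming a 12-layer backbone and the default num_additional_layers=2 (the exact per-layer offset follows the loop that continues past this hunk):

layerwise_decay = 0.8
num_layers = 12
max_depth = num_layers + 2                       # num_additional_layers = 2
embed_lr_mult = layerwise_decay ** max_depth     # embeddings get the smallest multiplier
layer_lr_mults = [layerwise_decay ** (max_depth - depth) for depth in range(num_layers)]
# Later (deeper) layers keep a learning-rate multiplier closer to 1.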
@@ -630,11 +637,13 @@ def __init__(self, backbone_cfg,
self.rtd_encoder = nn.HybridSequential()
# Extra non-linear layer
self.rtd_encoder.add(nn.Dense(units=self.backbone_model.units,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.rtd_encoder.add(get_activation(self.backbone_model.activation))
self.rtd_encoder.add(nn.Dense(units=1,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
@@ -711,17 +720,20 @@ def __init__(self, backbone_cfg,
self.mlm_decoder = nn.HybridSequential()
# Extra non-linear layer
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size,
+ in_units=self.backbone_model.units,
flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.mlm_decoder.add(get_activation(self.backbone_model.activation))
- self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps))
+ self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps,
+ in_channels=self.backbone_model.units))
# only load the dense weights with a re-initialized bias
# parameters are stored in 'word_embed_bias' which is
# not used in original embedding
self.mlm_decoder.add(
nn.Dense(
units=self.backbone_model.vocab_size,
+ in_units=self.backbone_model.units,
flatten=False,
bias_initializer=bias_initializer))
self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight