add classifier_dropout to classification heads (#12794)
* add classifier_dropout to Electra

* no type annotations yet

Co-authored-by: Sylvain Gugger <[email protected]>

* add classifier_dropout to Electra

* add classifier_dropout to Electra ForTokenClass.

* add classifier_dropout to bert

* add classifier_dropout to roberta

* add classifier_dropout to big_bird

* add classifier_dropout to mobilebert

* empty commit to trigger CI

* add classifier_dropout to reformer

* add classifier_dropout to ConvBERT

* add classifier_dropout to Albert

* add classifier_dropout to Albert

Co-authored-by: Sylvain Gugger <[email protected]>
PhilipMay and sgugger authored Jul 26, 2021
1 parent 9ff672f commit 0c1c42c
Showing 24 changed files with 179 additions and 34 deletions.
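
In short, every classification head touched by this commit now reads the new classifier_dropout config value and falls back to hidden_dropout_prob when it is left unset (None). A minimal sketch of that pattern against the PyTorch BERT head; the 0.2 rate and the small config sizes are illustrative, not part of the commit:

    from transformers import BertConfig, BertForSequenceClassification

    config = BertConfig(
        hidden_size=64,          # small, illustrative sizes to keep the example light
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=128,
        classifier_dropout=0.2,  # new option; only the classification head uses it
    )

    # The fallback applied in each head:
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    assert classifier_dropout == 0.2

    model = BertForSequenceClassification(config)  # head dropout uses 0.2; the body keeps hidden_dropout_prob (0.1)
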
7 changes: 6 additions & 1 deletion src/transformers/models/albert/modeling_albert.py
@@ -1088,7 +1088,12 @@ def __init__(self, config):
self.num_labels = config.num_labels

self.albert = AlbertModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout_prob = (
config.classifier_dropout_prob
if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

self.init_weights()
7 changes: 6 additions & 1 deletion src/transformers/models/albert/modeling_tf_albert.py
@@ -1199,7 +1199,12 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
self.num_labels = config.num_labels

self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
classifier_dropout_prob = (
config.classifier_dropout_prob
if config.classifier_dropout_prob is not None
else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout_prob)
self.classifier = tf.keras.layers.Dense(
units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
4 changes: 4 additions & 0 deletions src/transformers/models/bert/configuration_bert.py
@@ -104,6 +104,8 @@ class BertConfig(PretrainedConfig):
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if ``config.is_decoder=True``.
classifier_dropout (:obj:`float`, `optional`):
The dropout ratio for the classification head.
Examples::
@@ -138,6 +140,7 @@ def __init__(
gradient_checkpointing=False,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -157,6 +160,7 @@ def __init__(
self.gradient_checkpointing = gradient_checkpointing
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout


class BertOnnxConfig(OnnxConfig):
10 changes: 8 additions & 2 deletions src/transformers/models/bert/modeling_bert.py
@@ -1486,7 +1486,10 @@ def __init__(self, config):
self.config = config

self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()
@@ -1677,7 +1680,10 @@ def __init__(self, config):
self.num_labels = config.num_labels

self.bert = BertModel(config, add_pooling_layer=False)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()
14 changes: 12 additions & 2 deletions src/transformers/models/bert/modeling_flax_bert.py
@@ -915,7 +915,12 @@ class FlaxBertForSequenceClassificationModule(nn.Module):

def setup(self):
self.bert = FlaxBertModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.classifier = nn.Dense(
self.config.num_labels,
dtype=self.dtype,
@@ -1057,7 +1062,12 @@ class FlaxBertForTokenClassificationModule(nn.Module):

def setup(self):
self.bert = FlaxBertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

def __call__(
10 changes: 8 additions & 2 deletions src/transformers/models/bert/modeling_tf_bert.py
@@ -1386,7 +1386,10 @@ def __init__(self, config: BertConfig, *inputs, **kwargs):
self.num_labels = config.num_labels

self.bert = TFBertMainLayer(config, name="bert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
@@ -1652,7 +1655,10 @@ def __init__(self, config: BertConfig, *inputs, **kwargs):
self.num_labels = config.num_labels

self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert")
self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(rate=classifier_dropout)
self.classifier = tf.keras.layers.Dense(
units=config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
4 changes: 4 additions & 0 deletions src/transformers/models/big_bird/configuration_big_bird.py
@@ -84,6 +84,8 @@ class BigBirdConfig(PretrainedConfig):
"block_sparse"`.
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
classifier_dropout (:obj:`float`, `optional`):
The dropout ratio for the classification head.
Example::
@@ -126,6 +128,7 @@ def __init__(
block_size=64,
num_random_blocks=3,
gradient_checkpointing=False,
classifier_dropout=None,
**kwargs
):
super().__init__(
@@ -157,3 +160,4 @@ def __init__(
self.use_bias = use_bias
self.block_size = block_size
self.num_random_blocks = num_random_blocks
self.classifier_dropout = classifier_dropout
10 changes: 8 additions & 2 deletions src/transformers/models/big_bird/modeling_big_bird.py
@@ -2605,7 +2605,10 @@ class BigBirdClassificationHead(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

self.config = config
@@ -2821,7 +2824,10 @@ def __init__(self, config):
self.num_labels = config.num_labels

self.bert = BigBirdModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()
14 changes: 12 additions & 2 deletions src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -1654,7 +1654,12 @@ class FlaxBigBirdClassificationHead(nn.Module):

def setup(self):
self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)

def __call__(self, features, deterministic=True):
@@ -1831,7 +1836,12 @@ class FlaxBigBirdForTokenClassificationModule(nn.Module):

def setup(self):
self.bert = FlaxBigBirdModule(config=self.config, dtype=self.dtype, add_pooling_layer=False)
self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(rate=classifier_dropout)
self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)

def __call__(
Expand Down
5 changes: 4 additions & 1 deletion src/transformers/models/convbert/configuration_convbert.py
@@ -73,7 +73,8 @@ class ConvBertConfig(PretrainedConfig):
The number of groups for grouped linear layers for ConvBert model
conv_kernel_size (:obj:`int`, `optional`, defaults to 9):
The size of the convolutional kernel.
classifier_dropout (:obj:`float`, `optional`):
The dropout ratio for the classification head.
Example::
>>> from transformers import ConvBertModel, ConvBertConfig
@@ -108,6 +109,7 @@ def __init__(
head_ratio=2,
conv_kernel_size=9,
num_groups=1,
classifier_dropout=None,
**kwargs,
):
super().__init__(
@@ -134,3 +136,4 @@ def __init__(
self.head_ratio = head_ratio
self.conv_kernel_size = conv_kernel_size
self.num_groups = num_groups
self.classifier_dropout = classifier_dropout
10 changes: 8 additions & 2 deletions src/transformers/models/convbert/modeling_convbert.py
@@ -936,7 +936,10 @@ class ConvBertClassificationHead(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

self.config = config
@@ -1152,7 +1155,10 @@ def __init__(self, config):
self.num_labels = config.num_labels

self.convbert = ConvBertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

self.init_weights()
10 changes: 8 additions & 2 deletions src/transformers/models/convbert/modeling_tf_convbert.py
@@ -970,7 +970,10 @@ def __init__(self, config, **kwargs):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
@@ -1240,7 +1243,10 @@ def __init__(self, config, *inputs, **kwargs):

self.num_labels = config.num_labels
self.convbert = TFConvBertMainLayer(config, name="convbert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
4 changes: 4 additions & 0 deletions src/transformers/models/electra/configuration_electra.py
@@ -104,6 +104,8 @@ class ElectraConfig(PretrainedConfig):
<https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
`Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
<https://arxiv.org/abs/2009.13658>`__.
classifier_dropout (:obj:`float`, `optional`):
The dropout ratio for the classification head.
Examples::
@@ -141,6 +143,7 @@ def __init__(
summary_last_dropout=0.1,
pad_token_id=0,
position_embedding_type="absolute",
classifier_dropout=None,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -164,3 +167,4 @@ def __init__(
self.summary_activation = summary_activation
self.summary_last_dropout = summary_last_dropout
self.position_embedding_type = position_embedding_type
self.classifier_dropout = classifier_dropout
10 changes: 8 additions & 2 deletions src/transformers/models/electra/modeling_electra.py
@@ -900,7 +900,10 @@ class ElectraClassificationHead(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, features, **kwargs):
@@ -1200,7 +1203,10 @@ def __init__(self, config):
super().__init__(config)

self.electra = ElectraModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()

14 changes: 12 additions & 2 deletions src/transformers/models/electra/modeling_flax_electra.py
@@ -783,7 +783,12 @@ class FlaxElectraForTokenClassificationModule(nn.Module):

def setup(self):
self.electra = FlaxElectraModule(config=self.config, dtype=self.dtype)
self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = nn.Dense(self.config.num_labels)

def __call__(
@@ -1069,7 +1074,12 @@ class FlaxElectraClassificationHead(nn.Module):

def setup(self):
self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
classifier_dropout = (
self.config.classifier_dropout
if self.config.classifier_dropout is not None
else self.config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)

def __call__(self, hidden_states, deterministic: bool = True):
12 changes: 10 additions & 2 deletions src/transformers/models/electra/modeling_tf_electra.py
@@ -1039,7 +1039,12 @@ def __init__(self, config, **kwargs):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout
if config.classifier_dropout is not None
else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
@@ -1309,7 +1314,10 @@ def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)

self.electra = TFElectraMainLayer(config, name="electra")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = tf.keras.layers.Dropout(classifier_dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
src/transformers/models/mobilebert/configuration_mobilebert.py
@@ -84,6 +84,8 @@ class MobileBertConfig(PretrainedConfig):
Number of FFNs in a block.
normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
The normalization type in MobileBERT.
classifier_dropout (:obj:`float`, `optional`):
The dropout ratio for the classification head.
Examples::
@@ -128,6 +130,7 @@ def __init__(
num_feedforward_networks=4,
normalization_type="no_norm",
classifier_activation=True,
classifier_dropout=None,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -158,3 +161,5 @@ def __init__(
self.true_hidden_size = intra_bottleneck_size
else:
self.true_hidden_size = hidden_size

self.classifier_dropout = classifier_dropout
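
A quick runtime check of the new behaviour, sketched against the PyTorch Electra token-classification head (assumes a transformers version that includes this commit; the sizes and the 0.3 rate are illustrative):

    from transformers import ElectraConfig, ElectraForTokenClassification

    small = dict(hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)

    # Leaving classifier_dropout unset keeps the old behaviour: the head falls back to hidden_dropout_prob.
    default_cfg = ElectraConfig(**small)
    assert ElectraForTokenClassification(default_cfg).dropout.p == default_cfg.hidden_dropout_prob

    # Setting it overrides the head's dropout only.
    override_cfg = ElectraConfig(**small, classifier_dropout=0.3)
    assert ElectraForTokenClassification(override_cfg).dropout.p == 0.3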