From 04de79553ebddc0d2f1511477df28d6c40512e36 Mon Sep 17 00:00:00 2001 From: huhuiwen99 <53830712+huhuiwen99@users.noreply.github.com> Date: Sun, 10 Oct 2021 17:45:44 +0800 Subject: [PATCH] Modify Roformer Doc (#1104) * modify transforner-rst * modify roformer tokenizer * modify roformer model * update * modify transformer * modify roformer modeling * modify decoder * update * modify tokenizer * modify token_embedding --- paddlenlp/embeddings/token_embedding.py | 39 +- paddlenlp/transformers/bert/modeling.py | 2 +- paddlenlp/transformers/roformer/modeling.py | 406 ++++++++++++++++-- paddlenlp/transformers/roformer/tokenizer.py | 165 ++++--- .../transformers/transformer/modeling.py | 33 +- .../unified_transformer/modeling.py | 2 +- .../unified_transformer/tokenizer.py | 22 +- 7 files changed, 555 insertions(+), 114 deletions(-) diff --git a/paddlenlp/embeddings/token_embedding.py b/paddlenlp/embeddings/token_embedding.py index 8e0fe425d95a8..e223a992f8776 100644 --- a/paddlenlp/embeddings/token_embedding.py +++ b/paddlenlp/embeddings/token_embedding.py @@ -242,6 +242,14 @@ def search(self, words): Returns: `numpy.array`: The vectors of specifying words. + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + vector = embed.search('Welcome to use PaddlePaddle and PaddleNLP!') + """ idx_list = self.get_idx_list_from_words(words) idx_tensor = paddle.to_tensor(idx_list) @@ -271,6 +279,15 @@ def get_idx_list_from_words(self, words): Returns: `list`: The indexes list of specifying words. + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + index = embed.get_idx_from_word('Welcome to use PaddlePaddle and PaddleNLP!') + #635963 + """ if isinstance(words, str): idx_list = [self.get_idx_from_word(words)] @@ -305,7 +322,16 @@ def dot(self, word_a, word_b): word_b (`str`): The second word string. Returns: - `Float`: The dot product of 2 words. + float: The dot product of 2 words. + + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + dot_product = embed.dot('PaddlePaddle', 'PaddleNLP!') + #0.11827179 """ dot = self._dot_np @@ -321,7 +347,16 @@ def cosine_sim(self, word_a, word_b): word_b (`str`): The second word string. Returns: - `Float`: The cosine similarity of 2 words. + float: The cosine similarity of 2 words. + + Examples: + .. 
code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + cosine_simi = embed.cosine_sim('PaddlePaddle', 'PaddleNLP!') + #0.99999994 """ dot = self._dot_np diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 46ba389663c0f..19d40f81234c7 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -484,7 +484,7 @@ def forward(self, tokenizer = BertTokenizer.from_pretrained('bert-wwm-chinese') model = BertModel.from_pretrained('bert-wwm-chinese') - inputs = tokenizer("欢迎使用百度飞浆!") + inputs = tokenizer("欢迎使用百度飞桨!") inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} output = model(**inputs) ''' diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index e2c2c856039ac..0aed78a14a332 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -254,8 +254,8 @@ def forward(self, hidden_states): class RoFormerPretrainedModel(PretrainedModel): """ An abstract class for pretrained RoFormer models. It provides RoFormer related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and loading pretrained models. See `PretrainedModel` for more details. """ @@ -471,47 +471,66 @@ def init_weights(self, layer): @register_base_model class RoFormerModel(RoFormerPretrainedModel): """ - The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. + The bare RoFormer Model transformer outputting raw hidden-states. This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. - Check the superclass documentation for the generic methods and the library implements for all its model. + Refer to the superclass documentation for the generic methods. This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: - vocab_size (`int`): - Vocabulary size of the RoFormerModel. Defines the number of different tokens that can - be represented by the `inputs_ids` passed when calling RoFormerModel. - embedding_size (`int`, optional): - Dimensionality of the embedding size. Defaults to ``768`` if not provided. - hidden_size (`int`, optional): - Dimensionality of the encoder layers and the pooler layer. Defaults to ``768``. - num_hidden_layers (`int`, optional): - Number of hidden layers in the Transformer encoder. Defaults to ``12``. - num_attention_heads (`int`, optional): + vocab_size (int): + Vocabulary size of `inputs_ids` in `RoFormerModel`. It is also the vocab size of the token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RoFormerModel`. + embedding_size (int, optional): + Dimensionality of the embedding layer. Defaults to `768`. + hidden_size (int, optional): + Dimensionality of the encoder layers and the pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. - Defaults to ``12``.
- intermediate_size (`int`, optional): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - Defaults to ``3072``. - hidden_act (`str`, optional): + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are first projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions - are supported. Defaults to ``"gelu"``. - hidden_dropout_prob (`float`, optional): + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. - Defaults to ``0.1``. - attention_probs_dropout_prob (`float`, optional): - The dropout probability for all fully connected layers in the pooler. - Defaults to ``0.1``. - initializer_range (`float`, optional): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - Defaults to ``0.02``. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`RoFormerPretrainedModel.init_weights()` for how weights are initialized in `RoFormerModel`. + + pad_token_id (int, optional): + The index of the padding token in the token vocabulary. + Defaults to `0`. + pool_act (str, optional): + The non-linear activation function in the pooler. + Defaults to `"tanh"`. rotary_value (`bool`, optional): - whether or not apply rotay position embeddings to value. - Defaults to ``False``. + Whether or not to apply rotary position embeddings to the value tensor. + Defaults to `False`. """ def __init__( @@ -560,6 +579,75 @@ def forward( token_type_ids=None, attention_mask=None, output_hidden_states=False, ): + r''' + The RoFormerModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings.
+ position_ids(Tensor, optional): + Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means no positions are masked. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layer. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) or (`encoder_outputs`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of the first token (`[CLS]`) in the sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers`. + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerModel, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerModel.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + ''' + if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id @@ -589,6 +677,20 @@ def forward( class RoFormerForQuestionAnswering(RoFormerPretrainedModel): + """ + RoFormer Model with a span classification head on top for extractive question-answering tasks like + SQuAD (linear layers on top of the hidden-states output to compute `span start logits` and + `span end logits`). + + Args: + roformer (:class:`RoFormerModel`): + An instance of RoFormerModel. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` of `RoFormerModel` + instance `roformer`. Defaults to `None`. + """ + def __init__(self, roformer, dropout=None): super(RoFormerForQuestionAnswering, self).__init__() self.roformer = roformer # allow roformer to be config @@ -596,6 +698,45 @@ def __init__(self, roformer, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None): + r""" + The RoFormerForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`.
+ token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicating the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicating the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForQuestionAnswering + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForQuestionAnswering.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ sequence_output, _ = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -610,13 +751,18 @@ def forward(self, input_ids, token_type_ids=None): class RoFormerForSequenceClassification(RoFormerPretrainedModel): """ - Model for sentence (pair) classification task with RoFormer. + RoFormer Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + Args: - roformer (RoFormerModel): An instance of RoFormerModel. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of RoFormer. - If None, use the same value as `hidden_dropout_prob` of `RoFormerModel` - instance `roformer`. Default None + roformer (`RoFormerModel`): + An instance of `paddlenlp.transformers.RoFormerModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerModel` instance. Defaults to `None`. """ def __init__(self, roformer, num_classes=2, dropout=None): @@ -630,6 +776,33 @@ def __init__(self, roformer, num_classes=2, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + ..
code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForSequenceClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForSequenceClassification.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ _, pooled_output = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -641,6 +814,21 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): class RoFormerForTokenClassification(RoFormerPretrainedModel): + """ + RoFormer Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + roformer (`RoFormerModel`): + An instance of `paddlenlp.transformers.RoFormerModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerModel` instance. Defaults to `None`. + """ + def __init__(self, roformer, num_classes=2, dropout=None): super(RoFormerForTokenClassification, self).__init__() self.num_classes = num_classes @@ -652,6 +840,33 @@ def __init__(self, roformer, num_classes=2, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForTokenClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForTokenClassification.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ sequence_output, _ = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -698,6 +913,25 @@ def forward(self, hidden_states, masked_positions=None): class RoFormerPretrainingHeads(Layer): + """ + Perform the language modeling task and the next sentence classification task. + + Args: + embedding_size (int): + See :class:`RoFormerModel`. + hidden_size (int): + See :class:`RoFormerModel`. + vocab_size (int): + See :class:`RoFormerModel`. + activation (str): + Activation function used in the language modeling task. + embedding_weights (Tensor, optional): + Decoding weights used to map hidden_states to logits of the masked token prediction. + Its data type should be float32 and its shape is [vocab_size, hidden_size]. + Defaults to `None`, which means the same weights as the embedding layer are used. + + """ + def __init__(self, embedding_size, hidden_size, vocab_size, @@ -711,12 +945,51 @@ def __init__(self, self.seq_relationship = nn.Linear(hidden_size, 2) def forward(self, sequence_output, pooled_output, masked_positions=None): + """ + Args: + sequence_output(Tensor): + Sequence of hidden-states at the last layer of the model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size].
+ pooled_output(Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + masked_positions(Tensor, optional): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class RoFormerForPretraining(RoFormerPretrainedModel): + """ + RoFormer Model with pretraining tasks on top. + + Args: + roformer (:class:`RoFormerModel`): + An instance of :class:`RoFormerModel`. + + """ + def __init__(self, roformer): super(RoFormerForPretraining, self).__init__() self.roformer = roformer @@ -735,6 +1008,33 @@ def forward( token_type_ids=None, attention_mask=None, masked_positions=None, ): + r""" + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + masked_positions(Tensor, optional): + See :class:`RoFormerPretrainingHeads`. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ with paddle.static.amp.fp16_guard(): outputs = self.roformer( input_ids, @@ -747,6 +1047,14 @@ def forward( class RoFormerPretrainingCriterion(paddle.nn.Layer): + """ + Args: + vocab_size(int): + Vocabulary size of `inputs_ids` in `RoFormerModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `RoFormerModel`. + + """ + def __init__(self, vocab_size): super(RoFormerPretrainingCriterion, self).__init__() # CrossEntropyLoss is expensive since the inner reshape (copy) @@ -760,6 +1068,32 @@ def forward( masked_lm_labels, next_sentence_labels, masked_lm_scale, ): + """ + Args: + prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size] + seq_relationship_score(Tensor): + The scores of next sentence prediction. 
Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels(Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1] + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and + its shape is [batch_size, 1] + masked_lm_scale(Tensor or int): + The scale of masked tokens. Used for the normalization of masked language modeling loss. + If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. + Its data type should be float32 and its shape is [1]. + + """ with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 0d0111764bc59..09a041a329c37 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -24,12 +24,15 @@ class JiebaBasicTokenizer(BasicTokenizer): """ - Runs basic tokenization with jieba (punctuation splitting, lower casing, jieba pretokenizer etc.). + Runs basic tokenization with jieba (punctuation splitting, lower casing, jieba pretokenizer etc). + Args: - do_lower_case (bool): Whether the text strips accents and convert to - lower case. If you use the RoFormer Pretrained model, lower is set to - Flase when using the cased model, otherwise it is set to True. - Default: True. + vocab (:class:`paddlenlp.data.Vocab`): An instance of paddlenlp.data.Vocab. + do_lower_case (bool): + Whether the text strips accents and converts to lower case. + If you use the RoFormer Pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to `True`. """ def __init__(self, vocab, do_lower_case=True): @@ -61,27 +64,48 @@ class RoFormerTokenizer(PretrainedTokenizer): Constructs a RoFormer tokenizer. It uses a basic tokenizer to do punctuation splitting, lower casing, jieba pretokenizer and so on, and follows a WordPiece tokenizer to tokenize as subwords. + Args: - vocab_file (str): file path of the vocabulary - do_lower_case (bool): Whether the text strips accents and convert to - lower case. If you use the RoFormer pretrained model, lower is set to - Flase when using the cased model, otherwise it is set to True. - Default: True. - use_jieba (bool): Whether or not to tokenize the text with jieba. Default: False. - unk_token (str): The special token for unkown words. Default: "[UNK]". - sep_token (str): The special token for separator token . Default: "[SEP]". - pad_token (str): The special token for padding. Default: "[PAD]". - cls_token (str): The special token for cls. Default: "[CLS]". - mask_token (str): The special token for mask. Default: "[MASK]". + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool,optional): + Whether or not to lowercase the input when tokenizing. + If you use the RoFormer pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to`True`. 
+ use_jieba (bool, optional): + Whether or not to tokenize the text with jieba. Defaults to `False`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task, where the model tries to predict the original unmasked token. + Defaults to "[MASK]". Examples: - .. code-block:: python - from paddlenlp.transformers.roformer import RoFormerTokenizer + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') - # the following line get: ['今天', '的', '天气', '非常', '好', '!'] - tokens = tokenizer.tokenize('今天的天气非常好!') - # the following line get: '今天 的 天气 非常 好 !' - tokenizer.convert_tokens_to_string(tokens) + + tokens = tokenizer('欢迎使用百度飞桨') + ''' + {'input_ids': [101, 22355, 8994, 25854, 5438, 2473, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained @@ -185,9 +209,10 @@ def __init__( @property def vocab_size(self): """ - return the size of vocabulary. + Return the size of vocabulary. + Returns: - int: the size of vocabulary. + int: The size of vocabulary. """ return len(self.vocab) @@ -209,24 +234,47 @@ def _tokenize(self, text): def tokenize(self, text): """ - End-to-end tokenization for RoFormer models. + Converts a string to a list of tokens. + Args: text (str): The text to be tokenized. Returns: - list: A list of string representing converted tokens. + List(str): A list of string representing converted tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + """ return self._tokenize(text) def convert_tokens_to_string(self, tokens): """ - Converts a sequence of tokens (list of string) in a single string. Since - the usage of WordPiece introducing `##` to concat subwords, also remove - `##` when converting. + Converts a sequence of tokens (list of string) to a single string. + Args: tokens (list): A list of string representing tokens to be converted. + Returns: str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + strings = tokenizer.convert_tokens_to_string(tokens) + #'欢迎 使用 百度 飞 桨' + """ out_string = " ".join(tokens).replace(" ##", "").strip() return out_string @@ -235,16 +283,13 @@ def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. - Note: - This encodes inputs and checks the number of added tokens, and is therefore not efficient.
Do not put this - inside your training loop. - Args: - pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the - number of added tokens in the case of a single sequence if set to False. + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. Returns: - Number of tokens added to sequences + int: Number of tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] @@ -258,18 +303,18 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): adding special tokens. A Roformer sequence has the following format: - :: - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: - token_ids_0 (:obj:`List[int]`): + token_ids_0 (List[int]): List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. Returns: - :obj:`List[int]`: List of input_id with the appropriate special tokens. + List[int]: List of input_id with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] @@ -284,18 +329,18 @@ def build_offset_mapping_with_special_tokens(self, Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. A RoFormer offset_mapping has the following format: - :: - - single sequence: ``(0,0) X (0,0)`` - - pair of sequences: `(0,0) A (0,0) B (0,0)`` + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: `(0,0) A (0,0) B (0,0)`` Args: - offset_mapping_ids_0 (:obj:`List[tuple]`): - List of char offsets to which the special tokens will be added. - offset_mapping_ids_1 (:obj:`List[tuple]`, `optional`): - Optional second list of char offsets for offset mapping pairs. + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. Returns: - :obj:`List[tuple]`: List of char offsets with the appropriate offsets of special tokens. + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. """ if offset_mapping_1 is None: return [(0, 0)] + offset_mapping_0 + [(0, 0)] @@ -318,13 +363,13 @@ def create_token_type_ids_from_sequences(self, If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. Returns: - :obj:`List[int]`: List of token_type_id according to the given sequence(s). + List[int]: List of token_type_id according to the given sequence(s). """ _sep = [self.sep_token_id] _cls = [self.cls_token_id] @@ -342,13 +387,15 @@ def get_special_tokens_mask(self, special tokens using the tokenizer ``encode`` methods. Args: - token_ids_0 (List[int]): List of ids of the first sequence. - token_ids_1 (List[int], optinal): List of ids of the second sequence. 
+ token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. already_has_special_tokens (bool, optional): Whether or not the token list is already formatted with special tokens for the model. Defaults to None. Returns: - results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py index 881bafcd3716e..c1fd8531689be 100644 --- a/paddlenlp/transformers/transformer/modeling.py +++ b/paddlenlp/transformers/transformer/modeling.py @@ -232,12 +232,12 @@ def forward(self, predict, label): The average loss of current batch whose data type can be float32, float64. The relation between `sum_cost` and `avg_cost` can be described as: - .. math: + .. math:: - avg_cost = sum_cost / token_num + avg\_cost = sum\_cost / token\_num - `token_num` (Tensor): - The number of tokens of current batch. + The number of tokens of current batch. Its data type can be float32, float64. Example: .. code-block:: @@ -493,6 +493,31 @@ def tile_beam_merge_with_batch(t, beam_size): t) def step(self, time, inputs, states, **kwargs): + """ + Perform a beam search decoding step, which uses cell to get probabilities, + and follows a beam search step to calculate scores and select candidate token ids. + + Args: + time(Tensor): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Tensor): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. + states(Tensor): A structure of tensor variables. + It is same as the `initial_cell_states` returned by `initialize()` + for the first decoding step and `next_states` returned by + `step()` for the others. + kwargs(dict, optional): Additional keyword arguments, provided by the caller `dynamic_decode`. + + Returns: + tuple: Returns tuple (``beam_search_output, beam_search_state, next_inputs, finished``). + `beam_search_state` and `next_inputs` have the same structure, + shape and data type as the input arguments states and inputs separately. + `beam_search_output` is a namedtuple(including scores, predicted_ids, parent_ids as fields) of tensor variables, + where `scores, predicted_ids, parent_ids` all has a tensor value shaped [batch_size, beam_size] with data type + float32, int64, int64. `finished` is a bool tensor with shape [batch_size, beam_size]. + + """ # Steps for decoding. # Compared to RNN, Transformer has 3D data at every decoding step inputs = paddle.reshape(inputs, [-1, 1]) # token @@ -628,7 +653,7 @@ class TransformerModel(nn.Layer): The dropout probability used in MHA to drop some attention target. If None, use the value of dropout. Defaults to None. act_dropout (float): - The dropout probability used after FFN activition. If None, use + The dropout probability used after FFN activation. If None, use the value of dropout. Defaults to None. bos_id (int, optional): The start token id and also be used as padding id. Defaults to 0. 
diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index 286a0b2dd76a0..4505c4e7b8e11 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -195,7 +195,7 @@ class UnifiedTransformerModel(UnifiedTransformerPretrainedModel): Defaults to 0.1. normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of - MHA and FFN sub-layers. If True, pre-process is layer ormalization + MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Defaults to True. diff --git a/paddlenlp/transformers/unified_transformer/tokenizer.py b/paddlenlp/transformers/unified_transformer/tokenizer.py index 428f0b284fe2a..9b0334d560e5a 100644 --- a/paddlenlp/transformers/unified_transformer/tokenizer.py +++ b/paddlenlp/transformers/unified_transformer/tokenizer.py @@ -48,7 +48,7 @@ class UnifiedTransformerTokenizer(PretrainedTokenizer): vocab_file (str): The path of file to construct vocabulary. sentencepiece_model_file (str): - The sentencepiece model file required to instantiate a + The sentencepiece model file (ends with '.spm') required to instantiate a `SentencePiece `__. do_lower_case (bool, optional): Whether or not to lowercase the input when tokenizing. Defaults to @@ -246,18 +246,18 @@ def _tokenize(self, text, is_split_into_words=True): def tokenize(self, text, is_split_into_words=True): """ - End-to-end tokenization for UnifiedTransformer models. + Converts a string to a list of tokens. Args: text (str): The text to be tokenized. - is_split_into_words (bool, optinal): + is_split_into_words (bool, optional): Whether or not the input `text` has been pretokenized. If False, the input `text` will be pretokenized by `jieba` firstly. Defaults to True. Returns: - list[str]: A list of string representing converted tokens. + List(str): A list of string representing converted tokens. Example: .. code-block:: @@ -265,8 +265,8 @@ def tokenize(self, text, is_split_into_words=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - print(tokenizer.tokenize('我爱祖国', is_split_into_words=False)) - # ['▁我', '▁爱', '祖', '国'] + print(tokenizer.tokenize('欢迎使用百度飞桨!', is_split_into_words=False)) + # ['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'] """ return self._tokenize(text, is_split_into_words=is_split_into_words) @@ -307,10 +307,10 @@ def convert_tokens_to_string(self, tokens, keep_space=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - print(tokenizer.convert_tokens_to_string(['▁我', '▁爱', '祖', '国'])) - # 我 爱祖国 - print(tokenizer.convert_tokens_to_string(['▁我', '▁爱', '祖', '国'], keep_space=False)) - # 我爱祖国 + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'])) + # 欢迎 使用 百度 飞桨 ! + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'], keep_space=False)) + # 欢迎使用百度飞桨! 
""" tokens = self.merge_subword(tokens) if keep_space: @@ -342,7 +342,7 @@ def convert_ids_to_string(self, ids, keep_space=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - tokens = tokenizer.tokenize('我爱祖国', is_split_into_words=False) + tokens = tokenizer.tokenize('欢迎使用百度飞桨!', is_split_into_words=False) ids = tokenizer.convert_tokens_to_ids(tokens) print(ids) # [6, 121, 26907, 25475]