From 04de79553ebddc0d2f1511477df28d6c40512e36 Mon Sep 17 00:00:00 2001 From: huhuiwen99 <53830712+huhuiwen99@users.noreply.github.com> Date: Sun, 10 Oct 2021 17:45:44 +0800 Subject: [PATCH] Modify Roformer Doc (#1104) * modify transforner-rst * modify roformer tokenizer * modify roformer model * update * modify transformer * modify roformer modeling * modify decoder * update * modify tokenizer * modify token_embedding --- paddlenlp/embeddings/token_embedding.py | 39 +- paddlenlp/transformers/bert/modeling.py | 2 +- paddlenlp/transformers/roformer/modeling.py | 406 ++++++++++++++++-- paddlenlp/transformers/roformer/tokenizer.py | 165 ++++--- .../transformers/transformer/modeling.py | 33 +- .../unified_transformer/modeling.py | 2 +- .../unified_transformer/tokenizer.py | 22 +- 7 files changed, 555 insertions(+), 114 deletions(-) diff --git a/paddlenlp/embeddings/token_embedding.py b/paddlenlp/embeddings/token_embedding.py index 8e0fe425d95a8..e223a992f8776 100644 --- a/paddlenlp/embeddings/token_embedding.py +++ b/paddlenlp/embeddings/token_embedding.py @@ -242,6 +242,14 @@ def search(self, words): Returns: `numpy.array`: The vectors of specifying words. + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + vector = embed.search('Welcome to use PaddlePaddle and PaddleNLP!') + """ idx_list = self.get_idx_list_from_words(words) idx_tensor = paddle.to_tensor(idx_list) @@ -271,6 +279,15 @@ def get_idx_list_from_words(self, words): Returns: `list`: The indexes list of specifying words. + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + index = embed.get_idx_from_word('Welcome to use PaddlePaddle and PaddleNLP!') + #635963 + """ if isinstance(words, str): idx_list = [self.get_idx_from_word(words)] @@ -305,7 +322,16 @@ def dot(self, word_a, word_b): word_b (`str`): The second word string. Returns: - `Float`: The dot product of 2 words. + float: The dot product of 2 words. + + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + dot_product = embed.dot('PaddlePaddle', 'PaddleNLP!') + #0.11827179 """ dot = self._dot_np @@ -321,7 +347,16 @@ def cosine_sim(self, word_a, word_b): word_b (`str`): The second word string. Returns: - `Float`: The cosine similarity of 2 words. + float: The cosine similarity of 2 words. + + Examples: + .. 
code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + cosine_simi = embed.cosine_sim('PaddlePaddle', 'PaddleNLP!') + #0.99999994 """ dot = self._dot_np diff --git a/paddlenlp/transformers/bert/modeling.py b/paddlenlp/transformers/bert/modeling.py index 46ba389663c0f..19d40f81234c7 100644 --- a/paddlenlp/transformers/bert/modeling.py +++ b/paddlenlp/transformers/bert/modeling.py @@ -484,7 +484,7 @@ def forward(self, tokenizer = BertTokenizer.from_pretrained('bert-wwm-chinese') model = BertModel.from_pretrained('bert-wwm-chinese') - inputs = tokenizer("欢迎使用百度飞浆!") + inputs = tokenizer("欢迎使用百度飞桨!") inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} output = model(**inputs) ''' diff --git a/paddlenlp/transformers/roformer/modeling.py b/paddlenlp/transformers/roformer/modeling.py index e2c2c856039ac..0aed78a14a332 100644 --- a/paddlenlp/transformers/roformer/modeling.py +++ b/paddlenlp/transformers/roformer/modeling.py @@ -254,8 +254,8 @@ def forward(self, hidden_states): class RoFormerPretrainedModel(PretrainedModel): """ An abstract class for pretrained RoFormer models. It provides RoFormer related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and loading pretrained models. See `PretrainedModel` for more details. """ @@ -471,47 +471,66 @@ def init_weights(self, layer): @register_base_model class RoFormerModel(RoFormerPretrainedModel): """ - The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. + The bare RoFormer Model transformer outputting raw hidden-states. This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. - Check the superclass documentation for the generic methods and the library implements for all its model. + Refer to the superclass documentation for the generic methods. This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer and refer to the Paddle documentation for all matter related to general usage and behavior. Args: - vocab_size (`int`): - Vocabulary size of the RoFormerModel. Defines the number of different tokens that can - be represented by the `inputs_ids` passed when calling RoFormerModel. - embedding_size (`int`, optional): - Dimensionality of the embedding size. Defaults to ``768`` if not provided. - hidden_size (`int`, optional): - Dimensionality of the encoder layers and the pooler layer. Defaults to ``768``. - num_hidden_layers (`int`, optional): - Number of hidden layers in the Transformer encoder. Defaults to ``12``. - num_attention_heads (`int`, optional): + vocab_size (int): + Vocabulary size of `inputs_ids` in `RoFormerModel`. It is also the vocab size of the token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RoFormerModel`. + embedding_size (int, optional): + Dimensionality of the embedding layer. Defaults to `768`. + hidden_size (int, optional): + Dimensionality of the encoder layers and the pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): Number of attention heads for each attention layer in the Transformer encoder. - Defaults to ``12``.
- intermediate_size (`int`, optional): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - Defaults to ``3072``. - hidden_act (`str`, optional): + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are first projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): The non-linear activation function in the feed-forward layer. ``"gelu"``, ``"relu"`` and any other paddle supported activation functions - are supported. Defaults to ``"gelu"``. - hidden_dropout_prob (`float`, optional): + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): The dropout probability for all fully connected layers in the embeddings and encoder. - Defaults to ``0.1``. - attention_probs_dropout_prob (`float`, optional): - The dropout probability for all fully connected layers in the pooler. - Defaults to ``0.1``. - initializer_range (`float`, optional): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - Defaults to ``0.02``. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`RoFormerPretrainedModel.init_weights()` for how weights are initialized in `RoFormerModel`. + + pad_token_id (int, optional): + The index of the padding token in the token vocabulary. + Defaults to `0`. + pool_act (str, optional): + The non-linear activation function in the pooler. + Defaults to `"tanh"`. rotary_value (`bool`, optional): - whether or not apply rotay position embeddings to value. - Defaults to ``False``. + Whether or not to apply rotary position embeddings to the value tensor. + Defaults to `False`. """ def __init__( @@ -560,6 +579,75 @@ def forward( token_type_ids=None, attention_mask=None, output_hidden_states=False, ): + r''' + The RoFormerModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings.
+ position_ids(Tensor, optional): + Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means no positions are masked. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layer. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) or (`encoder_outputs`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of the first token (`[CLS]`) in the sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers`. + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerModel, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerModel.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + ''' + if attention_mask is None: attention_mask = paddle.unsqueeze( (input_ids == self.pad_token_id @@ -589,6 +677,20 @@ def forward( class RoFormerForQuestionAnswering(RoFormerPretrainedModel): + """ + RoFormer Model with a span classification head on top for extractive question-answering tasks like + SQuAD (linear layers on top of the hidden-states output to compute `span start logits` and + `span end logits`). + + Args: + roformer (:class:`RoFormerModel`): + An instance of RoFormerModel. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` of `RoFormerModel` + instance `roformer`. Defaults to `None`. + """ + def __init__(self, roformer, dropout=None): super(RoFormerForQuestionAnswering, self).__init__() self.roformer = roformer # allow roformer to be config @@ -596,6 +698,45 @@ def __init__(self, roformer, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None): + r""" + The RoFormerForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`.
+ token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicating the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicating the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForQuestionAnswering + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForQuestionAnswering.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ sequence_output, _ = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -610,13 +751,18 @@ def forward(self, input_ids, token_type_ids=None): class RoFormerForSequenceClassification(RoFormerPretrainedModel): """ - Model for sentence (pair) classification task with RoFormer. + RoFormer Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + Args: - roformer (RoFormerModel): An instance of RoFormerModel. - num_classes (int, optional): The number of classes. Default 2 - dropout (float, optional): The dropout probability for output of RoFormer. - If None, use the same value as `hidden_dropout_prob` of `RoFormerModel` - instance `roformer`. Default None + roformer (`RoFormerModel`): + An instance of `paddlenlp.transformers.RoFormerModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerModel` instance. Defaults to `None`. """ def __init__(self, roformer, num_classes=2, dropout=None): @@ -630,6 +776,33 @@ def __init__(self, roformer, num_classes=2, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + ..
code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForSequenceClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForSequenceClassification.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ _, pooled_output = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -641,6 +814,21 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None): class RoFormerForTokenClassification(RoFormerPretrainedModel): + """ + RoFormer Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + roformer (`RoFormerModel`): + An instance of `paddlenlp.transformers.RoFormerModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of RoFormer. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerModel` instance. Defaults to `None`. + """ + def __init__(self, roformer, num_classes=2, dropout=None): super(RoFormerForTokenClassification, self).__init__() self.num_classes = num_classes @@ -652,6 +840,33 @@ def __init__(self, roformer, num_classes=2, dropout=None): self.apply(self.init_weights) def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForTokenClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + model = RoFormerForTokenClassification.from_pretrained('roformer-chinese-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ sequence_output, _ = self.roformer( input_ids, token_type_ids=token_type_ids, @@ -698,6 +913,25 @@ def forward(self, hidden_states, masked_positions=None): class RoFormerPretrainingHeads(Layer): + """ + Perform the language modeling task and the next sentence classification task. + + Args: + embedding_size (int): + See :class:`RoFormerModel`. + hidden_size (int): + See :class:`RoFormerModel`. + vocab_size (int): + See :class:`RoFormerModel`. + activation (str): + Activation function used in the language modeling task. + embedding_weights (Tensor, optional): + Decoding weights used to map hidden_states to logits of the masked token prediction. + Its data type should be float32 and its shape is [vocab_size, hidden_size]. + Defaults to `None`, which means the same weights as the embedding layer are used. + + """ + def __init__(self, embedding_size, hidden_size, vocab_size, @@ -711,12 +945,51 @@ def __init__(self, self.seq_relationship = nn.Linear(hidden_size, 2) def forward(self, sequence_output, pooled_output, masked_positions=None): + """ + Args: + sequence_output(Tensor): + Sequence of hidden-states at the last layer of the model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size].
+ pooled_output(Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + masked_positions(Tensor, optional): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class RoFormerForPretraining(RoFormerPretrainedModel): + """ + RoFormer Model with pretraining tasks on top. + + Args: + roformer (:class:`RoFormerModel`): + An instance of :class:`RoFormerModel`. + + """ + def __init__(self, roformer): super(RoFormerForPretraining, self).__init__() self.roformer = roformer @@ -735,6 +1008,33 @@ def forward( token_type_ids=None, attention_mask=None, masked_positions=None, ): + r""" + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + masked_positions(Tensor, optional): + See :class:`RoFormerPretrainingHeads`. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ with paddle.static.amp.fp16_guard(): outputs = self.roformer( input_ids, @@ -747,6 +1047,14 @@ def forward( class RoFormerPretrainingCriterion(paddle.nn.Layer): + """ + Args: + vocab_size(int): + Vocabulary size of `inputs_ids` in `RoFormerModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `RoFormerModel`. + + """ + def __init__(self, vocab_size): super(RoFormerPretrainingCriterion, self).__init__() # CrossEntropyLoss is expensive since the inner reshape (copy) @@ -760,6 +1068,32 @@ def forward( masked_lm_labels, next_sentence_labels, masked_lm_scale, ): + """ + Args: + prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size] + seq_relationship_score(Tensor): + The scores of next sentence prediction. 
Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels(Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1] + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and + its shape is [batch_size, 1] + masked_lm_scale(Tensor or int): + The scale of masked tokens. Used for the normalization of masked language modeling loss. + If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. + Its data type should be float32 and its shape is [1]. + + """ with paddle.static.amp.fp16_guard(): masked_lm_loss = F.cross_entropy( prediction_scores, diff --git a/paddlenlp/transformers/roformer/tokenizer.py b/paddlenlp/transformers/roformer/tokenizer.py index 0d0111764bc59..09a041a329c37 100644 --- a/paddlenlp/transformers/roformer/tokenizer.py +++ b/paddlenlp/transformers/roformer/tokenizer.py @@ -24,12 +24,15 @@ class JiebaBasicTokenizer(BasicTokenizer): """ - Runs basic tokenization with jieba (punctuation splitting, lower casing, jieba pretokenizer etc.). + Runs basic tokenization with jieba (punctuation splitting, lower casing, jieba pretokenizer etc). + Args: - do_lower_case (bool): Whether the text strips accents and convert to - lower case. If you use the RoFormer Pretrained model, lower is set to - Flase when using the cased model, otherwise it is set to True. - Default: True. + vocab (:class:`paddlenlp.data.Vocab`): An instance of paddlenlp.data.Vocab. + do_lower_case (bool): + Whether the text strips accents and converts to lower case. + If you use the RoFormer Pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to `True`. """ def __init__(self, vocab, do_lower_case=True): @@ -61,27 +64,48 @@ class RoFormerTokenizer(PretrainedTokenizer): Constructs a RoFormer tokenizer. It uses a basic tokenizer to do punctuation splitting, lower casing, jieba pretokenizer and so on, and follows a WordPiece tokenizer to tokenize as subwords. + Args: - vocab_file (str): file path of the vocabulary - do_lower_case (bool): Whether the text strips accents and convert to - lower case. If you use the RoFormer pretrained model, lower is set to - Flase when using the cased model, otherwise it is set to True. - Default: True. - use_jieba (bool): Whether or not to tokenize the text with jieba. Default: False. - unk_token (str): The special token for unkown words. Default: "[UNK]". - sep_token (str): The special token for separator token . Default: "[SEP]". - pad_token (str): The special token for padding. Default: "[PAD]". - cls_token (str): The special token for cls. Default: "[CLS]". - mask_token (str): The special token for mask. Default: "[MASK]". + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool,optional): + Whether or not to lowercase the input when tokenizing. + If you use the RoFormer pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to`True`. 
+ use_jieba (bool, optional): + Whether or not to tokenize the text with jieba. Defaults to `False`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task, where the model tries to predict the original unmasked token. + Defaults to "[MASK]". Examples: - .. code-block:: python - from paddlenlp.transformers.roformer import RoFormerTokenizer + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') - # the following line get: ['今天', '的', '天气', '非常', '好', '!'] - tokens = tokenizer.tokenize('今天的天气非常好!') - # the following line get: '今天 的 天气 非常 好 !' - tokenizer.convert_tokens_to_string(tokens) + + tokens = tokenizer('欢迎使用百度飞桨') + ''' + {'input_ids': [101, 22355, 8994, 25854, 5438, 2473, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained @@ -185,9 +209,10 @@ def __init__( @property def vocab_size(self): """ - return the size of vocabulary. + Return the size of vocabulary. + Returns: - int: the size of vocabulary. + int: The size of vocabulary. """ return len(self.vocab) @@ -209,24 +234,47 @@ def _tokenize(self, text): def tokenize(self, text): """ - End-to-end tokenization for RoFormer models. + Converts a string to a list of tokens. + Args: text (str): The text to be tokenized. Returns: - list: A list of string representing converted tokens. + List(str): A list of string representing converted tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + """ return self._tokenize(text) def convert_tokens_to_string(self, tokens): """ - Converts a sequence of tokens (list of string) in a single string. Since - the usage of WordPiece introducing `##` to concat subwords, also remove - `##` when converting. + Converts a sequence of tokens (list of string) to a single string. + Args: tokens (list): A list of string representing tokens to be converted. + Returns: str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + strings = tokenizer.convert_tokens_to_string(tokens) + #'欢迎 使用 百度 飞 桨' + """ out_string = " ".join(tokens).replace(" ##", "").strip() return out_string @@ -235,16 +283,13 @@ def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. - Note: - This encodes inputs and checks the number of added tokens, and is therefore not efficient.
Do not put this - inside your training loop. - Args: - pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the - number of added tokens in the case of a single sequence if set to False. + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. Returns: - Number of tokens added to sequences + int: Number of tokens added to sequences. """ token_ids_0 = [] token_ids_1 = [] @@ -258,18 +303,18 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): adding special tokens. A Roformer sequence has the following format: - :: - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` Args: - token_ids_0 (:obj:`List[int]`): + token_ids_0 (List[int]): List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. Returns: - :obj:`List[int]`: List of input_id with the appropriate special tokens. + List[int]: List of input_id with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] @@ -284,18 +329,18 @@ def build_offset_mapping_with_special_tokens(self, Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. A RoFormer offset_mapping has the following format: - :: - - single sequence: ``(0,0) X (0,0)`` - - pair of sequences: `(0,0) A (0,0) B (0,0)`` + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: `(0,0) A (0,0) B (0,0)`` Args: - offset_mapping_ids_0 (:obj:`List[tuple]`): - List of char offsets to which the special tokens will be added. - offset_mapping_ids_1 (:obj:`List[tuple]`, `optional`): - Optional second list of char offsets for offset mapping pairs. + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. Returns: - :obj:`List[tuple]`: List of char offsets with the appropriate offsets of special tokens. + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. """ if offset_mapping_1 is None: return [(0, 0)] + offset_mapping_0 + [(0, 0)] @@ -318,13 +363,13 @@ def create_token_type_ids_from_sequences(self, If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. Returns: - :obj:`List[int]`: List of token_type_id according to the given sequence(s). + List[int]: List of token_type_id according to the given sequence(s). """ _sep = [self.sep_token_id] _cls = [self.cls_token_id] @@ -342,13 +387,15 @@ def get_special_tokens_mask(self, special tokens using the tokenizer ``encode`` methods. Args: - token_ids_0 (List[int]): List of ids of the first sequence. - token_ids_1 (List[int], optinal): List of ids of the second sequence. 
+ token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. already_has_special_tokens (bool, optional): Whether or not the token list is already formatted with special tokens for the model. Defaults to None. Returns: - results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: diff --git a/paddlenlp/transformers/transformer/modeling.py b/paddlenlp/transformers/transformer/modeling.py index 881bafcd3716e..c1fd8531689be 100644 --- a/paddlenlp/transformers/transformer/modeling.py +++ b/paddlenlp/transformers/transformer/modeling.py @@ -232,12 +232,12 @@ def forward(self, predict, label): The average loss of current batch whose data type can be float32, float64. The relation between `sum_cost` and `avg_cost` can be described as: - .. math: + .. math:: - avg_cost = sum_cost / token_num + avg\_cost = sum\_cost / token\_num - `token_num` (Tensor): - The number of tokens of current batch. + The number of tokens of current batch. Its data type can be float32, float64. Example: .. code-block:: @@ -493,6 +493,31 @@ def tile_beam_merge_with_batch(t, beam_size): t) def step(self, time, inputs, states, **kwargs): + """ + Perform a beam search decoding step, which uses cell to get probabilities, + and follows a beam search step to calculate scores and select candidate token ids. + + Args: + time(Tensor): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Tensor): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. + states(Tensor): A structure of tensor variables. + It is same as the `initial_cell_states` returned by `initialize()` + for the first decoding step and `next_states` returned by + `step()` for the others. + kwargs(dict, optional): Additional keyword arguments, provided by the caller `dynamic_decode`. + + Returns: + tuple: Returns tuple (``beam_search_output, beam_search_state, next_inputs, finished``). + `beam_search_state` and `next_inputs` have the same structure, + shape and data type as the input arguments states and inputs separately. + `beam_search_output` is a namedtuple(including scores, predicted_ids, parent_ids as fields) of tensor variables, + where `scores, predicted_ids, parent_ids` all has a tensor value shaped [batch_size, beam_size] with data type + float32, int64, int64. `finished` is a bool tensor with shape [batch_size, beam_size]. + + """ # Steps for decoding. # Compared to RNN, Transformer has 3D data at every decoding step inputs = paddle.reshape(inputs, [-1, 1]) # token @@ -628,7 +653,7 @@ class TransformerModel(nn.Layer): The dropout probability used in MHA to drop some attention target. If None, use the value of dropout. Defaults to None. act_dropout (float): - The dropout probability used after FFN activition. If None, use + The dropout probability used after FFN activation. If None, use the value of dropout. Defaults to None. bos_id (int, optional): The start token id and also be used as padding id. Defaults to 0. 
diff --git a/paddlenlp/transformers/unified_transformer/modeling.py b/paddlenlp/transformers/unified_transformer/modeling.py index 286a0b2dd76a0..4505c4e7b8e11 100644 --- a/paddlenlp/transformers/unified_transformer/modeling.py +++ b/paddlenlp/transformers/unified_transformer/modeling.py @@ -195,7 +195,7 @@ class UnifiedTransformerModel(UnifiedTransformerPretrainedModel): Defaults to 0.1. normalize_before (bool, optional): Indicate whether to put layer normalization into preprocessing of - MHA and FFN sub-layers. If True, pre-process is layer ormalization + MHA and FFN sub-layers. If True, pre-process is layer normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Defaults to True. diff --git a/paddlenlp/transformers/unified_transformer/tokenizer.py b/paddlenlp/transformers/unified_transformer/tokenizer.py index 428f0b284fe2a..9b0334d560e5a 100644 --- a/paddlenlp/transformers/unified_transformer/tokenizer.py +++ b/paddlenlp/transformers/unified_transformer/tokenizer.py @@ -48,7 +48,7 @@ class UnifiedTransformerTokenizer(PretrainedTokenizer): vocab_file (str): The path of file to construct vocabulary. sentencepiece_model_file (str): - The sentencepiece model file required to instantiate a + The sentencepiece model file (ends with '.spm') required to instantiate a `SentencePiece `__. do_lower_case (bool, optional): Whether or not to lowercase the input when tokenizing. Defaults to @@ -246,18 +246,18 @@ def _tokenize(self, text, is_split_into_words=True): def tokenize(self, text, is_split_into_words=True): """ - End-to-end tokenization for UnifiedTransformer models. + Converts a string to a list of tokens. Args: text (str): The text to be tokenized. - is_split_into_words (bool, optinal): + is_split_into_words (bool, optional): Whether or not the input `text` has been pretokenized. If False, the input `text` will be pretokenized by `jieba` firstly. Defaults to True. Returns: - list[str]: A list of string representing converted tokens. + List(str): A list of string representing converted tokens. Example: .. code-block:: @@ -265,8 +265,8 @@ def tokenize(self, text, is_split_into_words=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - print(tokenizer.tokenize('我爱祖国', is_split_into_words=False)) - # ['▁我', '▁爱', '祖', '国'] + print(tokenizer.tokenize('欢迎使用百度飞桨!', is_split_into_words=False)) + # ['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'] """ return self._tokenize(text, is_split_into_words=is_split_into_words) @@ -307,10 +307,10 @@ def convert_tokens_to_string(self, tokens, keep_space=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - print(tokenizer.convert_tokens_to_string(['▁我', '▁爱', '祖', '国'])) - # 我 爱祖国 - print(tokenizer.convert_tokens_to_string(['▁我', '▁爱', '祖', '国'], keep_space=False)) - # 我爱祖国 + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'])) + # 欢迎 使用 百度 飞桨 ! + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'], keep_space=False)) + # 欢迎使用百度飞桨! 
""" tokens = self.merge_subword(tokens) if keep_space: @@ -342,7 +342,7 @@ def convert_ids_to_string(self, ids, keep_space=True): from paddlenlp.transformers import UnifiedTransformerTokenizer tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') - tokens = tokenizer.tokenize('我爱祖国', is_split_into_words=False) + tokens = tokenizer.tokenize('欢迎使用百度飞桨!', is_split_into_words=False) ids = tokenizer.convert_tokens_to_ids(tokens) print(ids) # [6, 121, 26907, 25475]