diff --git a/paddlenlp/embeddings/token_embedding.py b/paddlenlp/embeddings/token_embedding.py
index c1caef1792951c..8e0fe425d95a84 100644
--- a/paddlenlp/embeddings/token_embedding.py
+++ b/paddlenlp/embeddings/token_embedding.py
@@ -34,7 +34,7 @@ def list_embedding_name():
     """
-    List all names of pretrained embedding models paddlenlp provides.
+    Lists all names of pretrained embedding models paddlenlp provides.
     """
     return list(EMBEDDING_NAME_LIST)
@@ -46,20 +46,26 @@ class TokenEmbedding(nn.Embedding):
     by specifying extended_vocab_path.

     Args:
-        embedding_name (object: `str`, optional, default to `w2v.baidu_encyclopedia.target.word-word.dim300`):
+        embedding_name (`str`, optional):
             The pre-trained embedding model name. Use `paddlenlp.embeddings.list_embedding_name()` to
-            show which embedding model we have alreaady provide.
+            list the names of all embedding models that we provide.
+            Defaults to `w2v.baidu_encyclopedia.target.word-word.dim300`.
-        unknown_token (object: `str`, optional, default to `[UNK]`):
-            Specifying unknown token as unknown_token.
-        unknown_token_vector (object: list, optional, default to `None`):
+        unknown_token (`str`, optional):
+            Specifies the unknown token.
+            Defaults to `[UNK]`.
+        unknown_token_vector (`list`, optional):
             To initialize the vector of unknown token. If it's none, use normal distribution to
             initialize the vector of unknown token.
-        extended_vocab_path (object: `str`, optional, default to `None`):
+            Defaults to `None`.
+        extended_vocab_path (`str`, optional):
             The file path of extended vocabulary.
-        trainable (object: `bool`, optional, default to True):
+            Defaults to `None`.
+        trainable (`bool`, optional):
             Whether the weight of embedding can be trained.
-        keep_extended_vocab_only (object: `bool`, optional, default to True):
-            Whether keep the extended vocabulary only, will be effective only if provides extended_vocab_path
+            Defaults to True.
+        keep_extended_vocab_only (`bool`, optional):
+            Whether to keep the extended vocabulary only. It takes effect only when extended_vocab_path is provided.
+            Defaults to False.
     """

     def __init__(self,
@@ -120,7 +126,7 @@ def __init__(self,
     def _init_without_extend_vocab(self, vector_np, pad_vector, unk_vector):
         """
-        Construct index to word list, word to index dict and embedding weight.
+        Constructs the index to word list, word to index dict and embedding weight.
         """
         self._idx_to_word = list(vector_np['vocab'])
         self._idx_to_word.append(self.unknown_token)
@@ -144,7 +150,7 @@ def _read_vocab_list_from_file(self, extended_vocab_path):
     def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
                       unk_vector, keep_extended_vocab_only):
         """
-        Construct index to word list, word to index dict and embedding weight using
+        Constructs the index to word list, word to index dict and embedding weight using
         extended vocab.
         """
         logger.info("Start extending vocab.")
@@ -217,20 +223,25 @@ def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
     def set_trainable(self, trainable):
         """
-        Set the weight of embedding can be trained.
+        Sets whether the weights of the token embedding are trainable.
+
         Args:
-            trainable (object: `bool`, required):
-                Whether the weight of embedding can be trained.
+            trainable (`bool`):
+                If True, the weights can be trained; if False, they are fixed.
+
         """
         self.weight.stop_gradient = not trainable

     def search(self, words):
         """
-        Get the vectors of specifying words.
+        Gets the vectors of the specified words.
+
         Args:
-            words (object: `list` or `str` or `int`, required): The words which need to be searched.
+            words (`list` or `str` or `int`): The words that need to be searched.
+
         Returns:
-            word_vector (object: `numpy.array`): The vectors of specifying words.
+            `numpy.array`: The vectors of the specified words.
+
         """
         idx_list = self.get_idx_list_from_words(words)
         idx_tensor = paddle.to_tensor(idx_list)
@@ -238,14 +249,28 @@ def search(self, words):
     def get_idx_from_word(self, word):
         """
-        Get the index of specifying word by searching word_to_idx dict.
+        Gets the index of the specified word by searching the word_to_idx dict.
+
+        Args:
+            word (`list` or `str` or `int`): The input word to be converted to an index.
+
+        Returns:
+            `int`: The index of the specified word.
+
         """
         return get_idx_from_word(word, self.vocab.token_to_idx,
                                  self.unknown_token)

     def get_idx_list_from_words(self, words):
         """
-        Get the index list of specifying words by searching word_to_idx dict.
+        Gets the index list of the specified words by searching the word_to_idx dict.
+
+        Args:
+            words (`list` or `str` or `int`): The input words to be converted to indices.
+
+        Returns:
+            `list`: The index list of the specified words.
+
         """
         if isinstance(words, str):
             idx_list = [self.get_idx_from_word(words)]
@@ -271,24 +296,33 @@ def _calc_word(self, word_a, word_b, calc_kernel):
     def dot(self, word_a, word_b):
         """
-        Calculate the scalar product of 2 words.
+        Calculates the dot product of two words. The dot product (or scalar product) is an
+        algebraic operation that takes two equal-length sequences of numbers (usually
+        coordinate vectors) and returns a single number.
+
         Args:
-            word_a (object: `str`, required): The first word string.
-            word_b (object: `str`, required): The second word string.
+            word_a (`str`): The first word string.
+            word_b (`str`): The second word string.
+
         Returns:
-            The scalar product of 2 words.
+            `Float`: The dot product of the two words.
+
         """
         dot = self._dot_np
         return self._calc_word(word_a, word_b, lambda x, y: dot(x, y))

     def cosine_sim(self, word_a, word_b):
         """
-        Calculate the cosine similarity of 2 words.
+        Calculates the cosine similarity of two word vectors. Cosine similarity is the
+        cosine of the angle between two n-dimensional vectors in an n-dimensional space.
+
         Args:
-            word_a (object: `str`, required): The first word string.
-            word_b (object: `str`, required): The second word string.
+            word_a (`str`): The first word string.
+            word_b (`str`): The second word string.
+
         Returns:
-            The cosine similarity of 2 words.
+            `Float`: The cosine similarity of the two words.
+
         """
         dot = self._dot_np
         return self._calc_word(
@@ -297,11 +331,14 @@ def cosine_sim(self, word_a, word_b):
     def _construct_word_to_idx(self, idx_to_word):
         """
-        Construct word to index dict.
+        Constructs the word to index dict.
+
         Args:
-            idx_to_word (object: 'list', required):
+            idx_to_word ('list'): The index to word list.
+
         Returns:
-            word_to_idx (object: `dict`): The word to index dict constructed by idx_to_word.
+            `Dict`: The word to index dict constructed by idx_to_word.
+
         """
         word_to_idx = {}
         for i, word in enumerate(idx_to_word):
@@ -311,7 +348,8 @@ def __repr__(self):
         """
         Returns:
-            info (object: `str`): The token embedding infomation.
+            `Str`: The token embedding information.
+
         """
         info = "Object type: {}\
             \nUnknown index: {}\
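Note: the TokenEmbedding docstrings above describe `search`, `dot`, `cosine_sim` and the index lookups, but no end-to-end usage is shown. A minimal sketch of how these methods fit together (the query words and the printed shape are illustrative assumptions, not output from a real run):

    from paddlenlp.embeddings import TokenEmbedding

    # Load the default pretrained embedding documented above.
    embedding = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")

    # search() accepts a str, an int index, or a list and returns a numpy array.
    vectors = embedding.search(["中国", "北京"])
    print(vectors.shape)  # expected: (2, 300) for a dim300 model

    # Index lookups documented above.
    idx = embedding.get_idx_from_word("中国")
    idx_list = embedding.get_idx_list_from_words(["中国", "北京"])

    # Similarity helpers.
    print(embedding.cosine_sim("中国", "北京"))
    print(embedding.dot("中国", "北京"))

    # Freeze the weights, as described in set_trainable().
    embedding.set_trainable(False)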
+ """ info = "Object type: {}\ \nUnknown index: {}\ diff --git a/paddlenlp/transformers/bigbird/modeling.py b/paddlenlp/transformers/bigbird/modeling.py index 863c9737709366..2b82ddf2c601da 100644 --- a/paddlenlp/transformers/bigbird/modeling.py +++ b/paddlenlp/transformers/bigbird/modeling.py @@ -205,7 +205,8 @@ class BigBirdPretrainedModel(PretrainedModel): An abstract class for pretrained BigBird models. It provides BigBird related `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. See `PretrainedModel` for more details. + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. """ model_config_file = "model_config.json" @@ -242,7 +243,7 @@ class BigBirdPretrainedModel(PretrainedModel): base_model_prefix = "bigbird" def init_weights(self, layer): - """ Initialization hook """ + # Initialization hook if isinstance(layer, (nn.Linear, nn.Embedding)): # In the dygraph mode, use the `set_value` to reset the parameter directly, # and reset the `state_dict` to update parameter in static mode. @@ -260,6 +261,73 @@ def init_weights(self, layer): @register_base_model class BigBirdModel(BigBirdPretrainedModel): + """ + The bare BigBird Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Check the superclass documentation for the generic methods and the library implements for all its model. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + num_layers (`int`): + Number of hidden layers in the Transformer encoder. + vocab_size (`int`): + Vocabulary size of the BigBird model. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling BigBirdModel. + nhead (`int`): + Number of heads in attention part. + attn_dropout (`float`, optional): + The dropout probability for all attention layers. + Defaults to ``0.1``. + dim_feedforward (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + Defaults to ``3072``. + activation (`str`, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"``, ``"silu"`` and ``"gelu_new"`` are supported. + Defaults to ``"gelu"``. + normalize_before (`bool`, optional): + Indicates whether to put layer normalization into preprocessing of MHA and FFN sub-layers. + If True, pre-process is layer normalization and post-precess includes dropout, + residual connection. Otherwise, no pre-process and post-precess includes dropout, + residual connection, layer normalization. + Defaults to ``False``. + block_size (`int`, optional): + The block size for the attention mask. + Defaults to ``1``. + window_size (`int`, optional): + The number of block in a window. + Defaults to ``3``. + num_global_blocks (`int`, optional): + Number of global blocks per sequence. + Defaults to ``1``. + num_rand_blocks (`int`, optional): + Number of random blocks per row. + Defaults to ``2``. + seed (`int`, `None`, optional): + The random seed for generating random block id. + Defaults to ``None``. + pad_token_id (`int`, optional): + The index of padding token for BigBird embedding. + Defaults to ``0``. 
+        hidden_size (`int`, optional):
+            Dimensionality of the encoder layers and the pooler layer.
+            Defaults to ``768``.
+        hidden_dropout_prob (`float`, optional):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+            Defaults to ``0.1``.
+        max_position_embeddings (`int`, optional):
+            The maximum length of the position embedding matrix, which dictates the maximum input
+            sequence length the model can handle.
+            Defaults to ``512``.
+        type_vocab_size (`int`, optional):
+            The vocabulary size of the `token_type_ids`.
+            Defaults to ``2``.
+    """
+
     def __init__(self,
                  num_layers,
                  vocab_size,
@@ -322,6 +390,77 @@ def forward(self,
                 token_type_ids=None,
                 attention_mask_list=None,
                 rand_mask_idx_list=None):
+        r"""
+        The BigBirdModel forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (`Tensor`):
+                Indices of input sequence tokens in the vocabulary.
+                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
+            token_type_ids (`Tensor`, optional):
+                Segment token indices to indicate first and second portions of the inputs.
+                Indices can either be 0 or 1:
+
+                - 0 corresponds to a *sentence A* token,
+                - 1 corresponds to a *sentence B* token.
+
+                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
+                Defaults to ``None``, which means we don't add segment embeddings.
+            attention_mask_list (`list`, optional):
+                A list of tensors used in multi-head attention to prevent attention to some
+                unwanted positions, usually the paddings or the subsequent positions. The tensors'
+                shapes will be broadcast to `[batch_size, n_head, sequence_length, sequence_length]`.
+                Defaults to ``None``.
+            rand_mask_idx_list (`list`, optional):
+                A list of tensors containing the random block indices used by BigBird sparse attention.
+                Defaults to ``None``.
+
+        Returns:
+            `Tuple`: (``encoder_output``, ``pooled_output``).
+
+            With the fields:
+
+            - encoder_output (`Tensor`):
+                Sequence of hidden states at the last layer of the model. Its data type should be float32 and
+                has a shape of [batch_size, sequence_length, hidden_size].
+
+            - pooled_output (`Tensor`):
+                The output of the first token (`[CLS]`) in the sequence. Its data type should be float32 and
+                has a shape of [batch_size, hidden_size].
+
+        Examples:
+            ..
code-block:: + + import paddle + from paddlenlp.transformers import BigBirdModel, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdModel.from_pretrained('bigbird-base-uncased') + config = model.config + max_seq_len = 512 + input_ids = tokenizer.convert_tokens_to_ids( + tokenizer( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so" + )) + input_ids.extend([0] * (max_seq_len - len(input_ids))) + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + """ embedding_output = self.embeddings(input_ids, token_type_ids) attention_mask_list, query_mask, key_mask = self._process_mask( input_ids, attention_mask_list) @@ -332,6 +471,17 @@ def forward(self, class BigBirdForSequenceClassification(BigBirdPretrainedModel): + """ + BigBird Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. + for GLUE tasks. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + num_classes (`int`, optional): + The number of classes. Defaults to ``None``. + """ + def __init__(self, bigbird, num_classes=None): super(BigBirdForSequenceClassification, self).__init__() self.bigbird = bigbird @@ -347,6 +497,57 @@ def forward(self, token_type_ids=None, attention_mask_list=None, rand_mask_idx_list=None): + r""" + The BigBirdForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (`Tensor`): + See :class:`BigBirdModel`. + token_type_ids (`Tensor`): + See :class:`BigBirdModel`. + attention_mask_list (`List`): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + + Returns: + `Tensor`: Probability of each class. Its data type should be float32 and it has a shape of [batch_size, num_classes]. + + Examples: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BigBirdForSequenceClassification, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForSequenceClassification.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + max_seq_len = 512 + input_ids = tokenizer.convert_tokens_to_ids( + tokenizer( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so" + )) + input_ids.extend([0] * (max_seq_len - len(input_ids))) + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + print(output) + """ _, pooled_output = self.bigbird( input_ids, token_type_ids, @@ -391,6 +592,23 @@ def forward(self, hidden_states, masked_positions=None): class BigBirdPretrainingHeads(Layer): + """ + The BigBird pretraining heads for a pretraiing task on top. + + Args: + hidden_size (`int`): + See :class:`BigBirdModel`. + vocab_size (`int`): + See :class:`BigBirdModel`. + activation (`str`): + See :class:`BigBirdModel`. + embedding_weights (`Tensor`, optional): + The weight of pretraining embedding layer. Its data type should be float32 + and its shape is [hidden_size, vocab_size]. + If set to `None`, use normal distribution to initialize weight. + Defaults to `None`. + """ + def __init__(self, hidden_size, vocab_size, @@ -402,12 +620,46 @@ def __init__(self, self.seq_relationship = nn.Linear(hidden_size, 2) def forward(self, sequence_output, pooled_output, masked_positions=None): + r""" + The BigBirdPretrainingHeads forward method, overrides the __call__() special method. + + Args: + sequence_output (`Tensor`): + The sequence output of BigBirdModel. Its data type should be float32 and + has a shape of [batch_size, sequence_length, hidden_size]. + pooled_output (`Tensor`): + The pooled output of BigBirdModel. Its data type should be float32 and + has a shape of [batch_size, hidden_size]. + masked_positions (`Tensor`): + The list of masked positions. Its data type should be int64 + and has a shape of [mask_token_num, hidden_size]. Defaults to `None`. + Returns: + `Tuple`: (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - prediction_scores (`Tensor`): + The prediction score of masked tokens. Its data type should be float32 and + has a shape of [batch_size, sequence_length, vocab_size]. + - seq_relationship_score (`Tensor`): + The logits whether 2 sequences are NSP relationship. Its data type should be float32 and + has a shape of [batch_size, 2]. 
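Note: a small shape-check example could make the Returns fields above easier to verify. A minimal sketch, assuming the constructor mirrors the Args documented for this class (hidden_size, vocab_size, activation, embedding_weights=None) and using illustrative sizes:

    import paddle
    from paddlenlp.transformers.bigbird.modeling import BigBirdPretrainingHeads

    # Assumed constructor arguments, following the Args section of this class.
    heads = BigBirdPretrainingHeads(hidden_size=768, vocab_size=50358, activation="gelu")

    batch_size, seq_len = 2, 128
    sequence_output = paddle.randn([batch_size, seq_len, 768])
    pooled_output = paddle.randn([batch_size, 768])

    # masked_positions defaults to None, so all positions are scored.
    prediction_scores, seq_relationship_score = heads(sequence_output, pooled_output)
    print(prediction_scores.shape)       # expected: [2, 128, 50358]
    print(seq_relationship_score.shape)  # expected: [2, 2]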
+ """ prediction_scores = self.predictions(sequence_output, masked_positions) seq_relationship_score = self.seq_relationship(pooled_output) return prediction_scores, seq_relationship_score class BigBirdForPretraining(BigBirdPretrainedModel): + """ + BigBird Model for a pretraiing task on top. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + + """ + def __init__(self, bigbird): super(BigBirdForPretraining, self).__init__() self.bigbird = bigbird @@ -425,6 +677,66 @@ def forward(self, position_ids=None, rand_mask_idx_list=None, masked_positions=None): + r""" + The BigBirdForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (`Tensor`): + See :class:`BigBirdModel`. + token_type_ids (`Tensor`): + See :class:`BigBirdModel`. + attention_mask_list (`List`): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + masked_positions (`List`): + The list of masked positions. + + Returns: + `Tuple`: (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - prediction_scores (`Tensor`): + The prediction score of masked tokens. Its data type should be float32 and + has a shape of [batch_size, sequence_length, vocab_size]. + - seq_relationship_score (`Tensor`): + The logits whether 2 sequences are NSP relationship. Its data type should be float32 and + has a shape of [batch_size, 2]. + + Examples: + .. code-block:: + + import paddle + from paddlenlp.transformers import BigBirdForPretraining, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForPretraining.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + max_seq_len = 512 + input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = tokenizer.encode( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so", max_seq_len=max_seq_len) + + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + print(output) + """ outputs = self.bigbird( input_ids, token_type_ids=token_type_ids, @@ -437,6 +749,22 @@ def forward(self, class BigBirdPretrainingCriterion(paddle.nn.Layer): + """ + BigBird Criterion for a pretraiing task on top. + + Args: + vocab_size (`int`): + See :class:`BigBirdModel`. + use_nsp (`bool`, optional): + It decides whether it considers NSP loss. + Defaults: False + ignore_index (`int`): + Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. 
+ Defaults to `0`. + """ + def __init__(self, vocab_size, use_nsp=False, ignore_index=0): super(BigBirdPretrainingCriterion, self).__init__() # CrossEntropyLoss is expensive since the inner reshape (copy) @@ -448,6 +776,92 @@ def __init__(self, vocab_size, use_nsp=False, ignore_index=0): def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale, masked_lm_weights): + r""" + The BigBirdPretrainingCriterion forward method, overrides the __call__() special method. + + Args: + prediction_scores (`Tensor`): + The logits of masked token prediction. Its data type should be float32 and + its shape is [batch_size, sequence_len, vocab_size] + seq_relationship_score (`Tensor`): + The logits whether 2 sequences are NSP relationship. Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels (`Tensor`): + The masked token labels. Its data type should be int64 and + its shape is [mask_token_num, 1] + next_sentence_labels (`Tensor`): + The labels of NSP tasks. Its data type should be int64 and + its shape is [batch_size, 1] + masked_lm_scale (`Tensor` or `int`): + The scale of masked tokens. If it is a `Tensor`, its data type should be int64 and + its shape is [mask_token_num, 1] + masked_lm_weights (`Tensor`): + The weight of masked tokens. Its data type should be float32 and its shape + is [mask_token_num, 1] + + Returns: + `Float`: The pretraining loss. + + Example: + .. code-block:: + import numpy as np + import paddle + from paddlenlp.transformers import BigBirdForPretraining, BigBirdTokenizer, BigBirdPretrainingCriterion + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForPretraining.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + criterion = BigBirdPretrainingCriterion(config["vocab_size"], False) + max_seq_len = 512 + max_pred_length=75 + input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = tokenizer.encode( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so", max_seq_len=max_seq_len, max_pred_len=max_pred_length) + + seq_len = len(input_ids) + masked_lm_positions_tmp = np.full(seq_len, 0, dtype=np.int32) + masked_lm_ids_tmp = np.full([seq_len, 1], -1, dtype=np.int64) + masked_lm_weights_tmp = np.full([seq_len], 0, dtype="float32") + + mask_token_num = 0 + for i, x in enumerate([input_ids]): + for j, pos in enumerate(masked_lm_positions): + masked_lm_positions_tmp[mask_token_num] = i * seq_len + pos + masked_lm_ids_tmp[mask_token_num] = masked_lm_ids[j] + masked_lm_weights_tmp[mask_token_num] = masked_lm_weights[j] + + masked_lm_positions = masked_lm_positions_tmp + masked_lm_ids = masked_lm_ids_tmp + masked_lm_weights = masked_lm_weights_tmp + print(masked_lm_ids.shape) + input_ids = paddle.to_tensor([input_ids]) + masked_lm_positions = paddle.to_tensor(masked_lm_positions) + masked_lm_ids = paddle.to_tensor(masked_lm_ids, dtype='int64') + 
masked_lm_weights = paddle.to_tensor(masked_lm_weights) + masked_lm_scale = 1.0 + next_sentence_labels = paddle.zeros(shape=(1, 1), dtype='int64') + + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + prediction_scores, seq_relationship_score = model(input_ids, rand_mask_idx_list=rand_mask_idx_list, masked_positions=masked_lm_positions) + + loss = criterion(prediction_scores, seq_relationship_score, + masked_lm_ids, next_sentence_labels, + masked_lm_scale, masked_lm_weights) + print(loss) + """ masked_lm_loss = paddle.nn.functional.softmax_with_cross_entropy( prediction_scores, masked_lm_labels, ignore_index=self.ignore_index) masked_lm_loss = paddle.transpose(masked_lm_loss, [1, 0]) diff --git a/paddlenlp/transformers/bigbird/tokenizer.py b/paddlenlp/transformers/bigbird/tokenizer.py index 808da96646536e..d143ccadf85fdd 100644 --- a/paddlenlp/transformers/bigbird/tokenizer.py +++ b/paddlenlp/transformers/bigbird/tokenizer.py @@ -29,19 +29,22 @@ class BigBirdTokenizer(PretrainedTokenizer): Constructs a BigBird tokenizer. It uses a basic tokenizer to do punctuation splitting, lower casing and so on, and follows a WordPiece tokenizer to tokenize as subwords. + Args: - sentencepiece_model_file(str): file path of the vocabulary - do_lower_case (bool): Whether the text strips accents and convert to + sentencepiece_model_file (`str`): File path of the vocabulary + do_lower_case (`bool`): Whether the text strips accents and convert to lower case. If you use the BigBird pretrained model, lower is set to False when using the cased model, otherwise it is set to True. - Default: True. - unk_token (str): The special token for unkown words. Default: "[UNK]". - sep_token (str): The special token for separator token . Default: "[SEP]". - pad_token (str): The special token for padding. Default: "[PAD]". - cls_token (str): The special token for cls. Default: "[CLS]". - mask_token (str): The special token for mask. Default: "[MASK]". + Defaults to True. + unk_token (`str`): The special token for unkown words. Defaults to `[UNK]`. + sep_token (`str`): The special token for separator token . Defaults to `[SEP]`. + pad_token (`str`): The special token for padding. Defaults to `[PAD]`. + cls_token (`str`): The special token for cls. Defaults to `[CLS]`. + mask_token (`str`): The special token for mask. Defaults to `[MASK]`. - Examples: + Raises: + ValueError: If file sentencepiece_model_file doesn't exist. + """ resource_files_names = { "sentencepiece_model_file": "sentencepiece_gpt2.model", @@ -97,20 +100,22 @@ def __init__(self, @property def vocab_size(self): """ - return the size of vocabulary. + Returns the size of vocabulary. + Returns: - int: the size of vocabulary. + `Int`: The size of vocabulary. """ return len(self.vocab) def _tokenize(self, text): """ End-to-end tokenization for BigBird models. + Args: text (str): The text to be tokenized. Returns: - list: A list of string representing converted tokens. + `List`: A list of string representing converted tokens. """ if len(text) == 0: return [] @@ -129,11 +134,12 @@ def _tokenize(self, text): def __call__(self, text, pair_text=None): """ End-to-end tokenization for BigBird models. + Args: text (str): The text to be tokenized. 
pair_text(str): The pair text to be tokenized. Returns: - list: A list of string representing converted tokens. + `List`: A list of string representing converted tokens. """ return self._tokenize(text) @@ -142,10 +148,11 @@ def convert_tokens_to_string(self, tokens): Converts a sequence of tokens (list of string) in a single string. Since the usage of WordPiece introducing `##` to concat subwords, also remove `##` when converting. + Args: tokens (list): A list of string representing tokens to be converted. Returns: - str: Converted string from tokens. + `Str`: Converted string from tokens. """ out_string = " ".join(tokens).replace(" ##", "").strip() return out_string @@ -156,6 +163,19 @@ def encode(self, max_pred_len=None, masked_lm_prob=0.15): """ + Returns a tuple containing the encoded sequence and mask information. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + max_seq_len (:obj:`int`, `optional`, defaults to`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If set to None, will not limit the total sequence. + max_pred_len (:obj:`int`, `optional`, defaults to `None`): + If set to a number, will limit the mask sequence returned so that it has a maximum prediction length. + If set to None, will not limit the mask sequence. """ def get_input_ids(text): @@ -255,11 +275,11 @@ def num_special_tokens_to_add(self, pair=False): inside your training loop. Args: - pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the - number of added tokens in the case of a single sequence if set to False. + pair (`bool`): Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. Returns: - Number of tokens added to sequences + `Int`: Number of tokens added to sequences """ token_ids_0 = [] token_ids_1 = [] @@ -273,9 +293,9 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): adding special tokens. A BERT sequence has the following format: - :: - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` Args: token_ids_0 (:obj:`List[int]`): @@ -284,7 +304,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): Optional second list of IDs for sequence pairs. Returns: - :obj:`List[int]`: List of input_id with the appropriate special tokens. + `List[int]`: List of input_id with the appropriate special tokens. """ if token_ids_1 is None: return [self.cls_id] + token_ids_0 + [self.sep_id]
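Note: the tokenizer docstrings above describe `__call__`, `num_special_tokens_to_add` and `build_inputs_with_special_tokens`, but only `encode` is exercised in the modeling examples. A short sketch of how the pieces compose (checkpoint name taken from the examples above; the exact sub-word tokens depend on the sentencepiece vocabulary):

    from paddlenlp.transformers import BigBirdTokenizer

    tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased')

    # __call__ returns a list of sub-word tokens.
    tokens = tokenizer("It especially looks at the problem of innuendo and gossip")
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Add [CLS]/[SEP] around a single sequence, as documented in
    # build_inputs_with_special_tokens().
    input_ids = tokenizer.build_inputs_with_special_tokens(token_ids)

    # Two special tokens are added for a single sequence, three for a pair.
    assert len(input_ids) == len(token_ids) + tokenizer.num_special_tokens_to_add(pair=False)

    # Round-trip back to a readable string.
    print(tokenizer.convert_tokens_to_string(tokens))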