From eb811857226d7c48e51b04fe5ab596db9b23ed12 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 10 Aug 2020 12:33:00 +0800 Subject: [PATCH 01/14] Add RNN related apis in paddl.nn test=develop --- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/rnn.py | 1190 +++++++++++++++++++++++++++- 2 files changed, 1187 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 4963ac360804f..0b8f9fa5a0bc9 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -27,6 +27,7 @@ from .extension import * from .activation import * from .norm import * +from .rnn import * # from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 4717609503f7f..b4ce7678cb747 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -12,10 +12,1192 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define classes of recurrent neural network +# TODO: define classes of recurrent neural network __all__ = [ - # 'RNNCell', - # 'GRUCell', - # 'LSTMCell' + 'RNNCellBase', + 'LSTMCell', + 'GRUCell', + 'StackedRNNCell', + 'StackedLSTMCell', + 'stackedGRUCell', + 'RNN', + 'BidirectionalRNN', + 'LSTM', + 'GRU', ] + +import copy +import collections +import itertools +import six +import sys +import warnings +from functools import partial, reduce + +import numpy as np + +from ... import fluid +from ...fluid import layers +from ...fluid.data_feeder import convert_dtype +from ...fluid.dygraph import Layer, LayerList +from ...fluid.param_attr import ParamAttr +from ...fluid.layers import utils, BeamSearchDecoder +from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as + + +class RNNCellBase(Layer): + """ + RNNCellBase is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0, + batch_dim_idx=0): + """ + Generate initialized states according to provided shape, data type and + value. + Parameters: + batch_ref: A (possibly nested structure of) tensor variable[s]. + The first dimension of the tensor will be used as batch size to + initialize states. + shape: A (possibly nested structure of) shape[s], where a shape is + represented as a list/tuple of integer). -1(for batch size) will + beautomatically inserted if shape is not started with it. If None, + property `state_shape` will be used. The default value is None. + dtype: A (possibly nested structure of) data type[s]. The structure + must be same as that of `shape`, except when all tensors' in states + has the same data type, a single data type can be used. If None and + property `cell.state_shape` is not available, float32 will be used + as the data type. The default value is None. + init_value: A float value used to initialize states. + batch_dim_idx: An integer indicating which dimension of the tensor in + inputs represents batch size. The default value is 0. + Returns: + Variable: tensor variable[s] packed in the same structure provided \ + by shape, representing the initialized states. 
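+        Examples:
+            A minimal sketch of initializing states (it assumes the
+            `paddle.LSTMCell` alias used in the cell docstring examples
+            below):
+
+            .. code-block:: python
+
+                import paddle
+
+                inputs = paddle.rand((2, 4, 32))  # batch size 2 at dim 0
+                cell = paddle.LSTMCell(input_size=32, hidden_size=64)
+                # `shape` defaults to `cell.state_shape` and `init_value` to 0,
+                # so this returns two zero tensors, each with shape [2, 64]
+                init_h, init_c = cell.get_initial_states(batch_ref=inputs)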
+ """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce(lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: # use fp32 as default + states_dtypes = "float32" + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is represented + as a list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). + Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class LSTMCell(RNNCellBase): + """ + Long-Short Term Memory(LSTM) RNN cell. + + The formula used is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size in the LSTM cell. + hidden_size (int): The hidden size in the LSTM cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. 
Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.LSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(LSTMCell, self).__init__(dtype) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = getattr(layers, "sigmoid") + self._activation = getattr(layers, "tanh") + self._param_attr = ParamAttr._to_attr(param_attr) + self._bias_attr = ParamAttr._to_attr(bias_attr) + self._dtype = dtype + + if self._param_attr and self._param_attr.name is not None: + weight_ih_param_attr = copy.deepcopy(self._param_attr) + weight_hh_param_attr = copy.deepcopy(self._param_attr) + weight_ih_param_attr.name += "_weight_ih" + weight_hh_param_attr.name += "_weight_hh" + else: + weight_ih_param_attr = self._param_attr + weight_hh_param_attr = self._param_attr + + if self._bias_attr and self._bias_attr.name is not None: + bias_ih_param_attr = copy.deepcopy(self._bias_attr) + bias_hh_param_attr = copy.deepcopy(self._bias_attr) + bias_ih_param_attr.name += "_bias_ih" + bias_hh_param_attr.name += "_bias_hh" + else: + bias_ih_param_attr = self._bias_attr + bias_hh_param_attr = self._bias_attr + + self.weight_ih = self.create_parameter( + attr=weight_ih_param_attr, + shape=[4 * hidden_size, input_size], + dtype=dtype) + + self.weight_hh = self.create_parameter( + attr=weight_hh_param_attr, + shape=[4 * hidden_size, hidden_size], + dtype=dtype) + + self.bias_ih = self.create_parameter( + attr=bias_ih_param_attr, + shape=[4 * hidden_size], + dtype=dtype, + is_bias=True) + self.bias_hh = self.create_parameter( + attr=bias_hh_param_attr, + shape=[4 * hidden_size], + dtype=dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step LSTM calculations. + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A list of containing two tensors, each shaped + `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` + in the formula. The data type should be float32 or float64. + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula; `new_states` is a list containing \ + two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}, c_{t}` in the formula. The data type of these \ + tensors all is same as that of `states`. 
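+        Examples:
+            A single-step sketch (it assumes the `paddle.LSTMCell` alias used
+            in the class docstring example):
+
+            .. code-block:: python
+
+                import paddle
+
+                step_input = paddle.rand((2, 32))
+                cell = paddle.LSTMCell(input_size=32, hidden_size=64)
+                init_states = cell.get_initial_states(batch_ref=step_input)
+                out, new_states = cell(step_input, init_states)
+                # out and both tensors in new_states have shape [2, 64]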
+ """ + pre_hidden, pre_cell = states + gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih: + gates = gates + self.bias_ih + gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh: + gates = gates + self.bias_hh + + chunked_gates = layers.split(gates, num_or_sections=4, dim=1) + + i = self._gate_activation(chunked_gates[0]) + f = self._gate_activation(chunked_gates[1]) + o = self._gate_activation(chunked_gates[3]) + c = f * pre_cell + i * self._activation(chunked_gates[2]) + h = o * self._activation(c) + + return h, [h, c] + + @property + def state_shape(self): + """ + The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` + (-1 for batch size would be automatically inserted into shape). These two + shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + """ + return [[self.hidden_size], [self.hidden_size]] + + +class GRUCell(RNNCellBase): + """ + Gated Recurrent Unit (GRU) RNN cell. + + The formula for GRU used is as follows: + + .. math:: + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. 
code-block:: python + + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = BasicGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(GRUCell, self).__init__() + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = getattr(layers, "sigmoid") + self._activation = getattr(layers, "tanh") + self._param_attr = ParamAttr._to_attr(param_attr) + self._bias_attr = ParamAttr._to_attr(bias_attr) + self._dtype = dtype + + if self._param_attr and self._param_attr.name is not None: + weight_ih_param_attr = copy.deepcopy(self._param_attr) + weight_hh_param_attr = copy.deepcopy(self._param_attr) + weight_ih_param_attr.name += "_weight_ih" + weight_hh_param_attr.name += "_weight_hh" + else: + weight_ih_param_attr = self._param_attr + weight_hh_param_attr = self._param_attr + + if self._bias_attr and self._bias_attr.name is not None: + bias_ih_param_attr = copy.deepcopy(self._bias_attr) + bias_hh_param_attr = copy.deepcopy(self._bias_attr) + bias_ih_param_attr.name += "_bias_ih" + bias_hh_param_attr.name += "_bias_hh" + else: + bias_ih_param_attr = self._bias_attr + bias_hh_param_attr = self._bias_attr + + self.weight_ih = self.create_parameter( + attr=weight_ih_param_attr, + shape=[3 * hidden_size, input_size], + dtype=dtype) + + self.weight_hh = self.create_parameter( + attr=weight_hh_param_attr, + shape=[3 * hidden_size, hidden_size], + dtype=dtype) + + self.bias_ih = self.create_parameter( + attr=bias_ih_param_attr, + shape=[3 * hidden_size], + dtype=dtype, + is_bias=True) + self.bias_hh = self.create_parameter( + attr=bias_hh_param_attr, + shape=[3 * hidden_size], + dtype=dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step GRU calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. The data type + should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ + `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ + corresponding to :math:`h_t` in the formula. The data type of the \ + tensor is same as that of `states`. + """ + pre_hidden = states + + x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih: + x_gates = x_gates + self.bias_ih + h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh: + h_gates = h_gates + self.bias_hh + + x_u, x_r, x_c = layers.split(x_gates, num_or_sections=3, dim=1) + h_u, h_r, h_c = layers.split(x_gates, num_or_sections=3, dim=1) + + u = self._gate_activation(x_u + h_u) + r = self._gate_activation(x_r + h_r) + h_c = r * h_c + c = self._activation(x_c + h_c) + h = u * pre_hidden + (1 - u) * c + + return h, h + + @property + def state_shape(self): + """ + The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to :math:`h_{t-1}`. + """ + return [self._hidden_size] + + +class StackedRNNCell(RNNCellBase): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. 
+ + Parameters: + cells (list|tuple): List of RNN cell instances. + + Examples: + .. code-block:: python + from paddle import LSTMCell, StackedRNNCell + cells = [LSTMCell(32, 32), LSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) + """ + + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + + def forward(self, inputs, states, **kwargs): + """ + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. + """ + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state, **kwargs) + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + Returns: + list: A list composed of each including cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedLSTMCell(RNNCellBase): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. + + The formula for LSTM used here is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Parameters: + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. 
+ dropout(float, optional): The dropout probability applied on the outputs + of each LSTM cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + LSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + # TODO(guosheng): maybe should stack list of states as one tensor + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedGRUCell(RNNCellBase): + """ + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. 
+ + The formula for GRU used here is as follows: + + .. math:: + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + num_layers(int, optional): The number of GRU to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each GRU cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedGRUCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked GRU cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last GRU; `new_states` \ + is a list composed of every GRU `new_states` which is also \ + :math:`h_{t}` in the formula, and the data type and structure \ + of these tensors all is same as that of `states`. 
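+        Examples:
+            A single-step sketch (it assumes the `paddle.StackedGRUCell` alias
+            used in the class docstring example; the initial states here are
+            arbitrary tensors of the right shape, for illustration only):
+
+            .. code-block:: python
+
+                import paddle
+
+                step_input = paddle.rand((2, 32))
+                cell = paddle.StackedGRUCell(input_size=32, hidden_size=64,
+                                             num_layers=2)
+                # one state tensor shaped [batch_size, hidden_size] per layer
+                states = [paddle.rand((2, 64)) for _ in range(2)]
+                out, new_states = cell(step_input, states)
+                # out: [2, 64]; new_states: a list of two [2, 64] tensors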
+ """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedGRUCell is a list composed of each including + GRU cell's `state_shape`. + + Returns: + list: A list composed of each including GRU cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class RNN(Layer): + """ + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) if time_major else (0, + 1) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. 
\ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ + flat_inputs = flatten(inputs) + batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) + + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, + dtype=self.cell.dtype if hasattr(self.cell, "dtype") else + self.cell.parameters()[0].dtype, + batch_dim_idx=self.batch_index) + + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = layers.elementwise_mul( + new_state, step_mask, axis=0) - layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state + + if not self.time_major: + inputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) + + if sequence_length is not None: + mask = layers.sequence_mask( + sequence_length, + maxlen=time_steps, + dtype=flatten(initial_states)[0].dtype) + mask = layers.transpose(mask, [1, 0]) + + if self.is_reverse: + inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), + inputs) + mask = layers.reverse( + mask, axis=[0]) if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + + final_outputs = map_structure( + lambda x: layers.stack(x.array, axis=self.time_step_index), + outputs) + + if self.is_reverse: + final_outputs = map_structure( + lambda x: layers.reverse(x, axis=self.time_step_index), + final_outputs) + + final_states = new_states + else: + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class BidirectionalRNN(Layer): + """ + Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. + Parameters: + cell_fw (RNNCell): A RNNCell instance used for forward RNN. + cell_bw (RNNCell): A RNNCell instance used for backward RNN. + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + Examples: + .. 
code-block:: python + import paddle + from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN + inputs = paddle.rand((2, 4, 32)) + cell_fw = StackedLSTMCell(32, 64) + cell_bw = StackedLSTMCell(32, 64) + bi_rnn = BidirectionalRNN(cell_fw, cell_bw) + outputs, _ = bi_rnn(inputs) # [2, 4, 128] + """ + + def __init__(self, + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None + else: + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs forward and backward RNN separately, and merge outputs of these + two RNN according to `merge_mode`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (Variable|list|tuple): If it is a list or tuple, its + length should be 2 to include initial states of forward and backward + RNN separately. Otherwise it would be used twice for the two RNN. + If None, `cell.get_initial_states` would be used to produce the initial + states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is produced by merge outputs of forward and backward RNN according \ + to `merge_mode`; similarly, `final_states` is produced by merge \ + `final_states` of forward and backward RNN. 
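+        Examples:
+            A sketch of a full forward call (it reuses the imports shown in the
+            class docstring example above; `merge_mode='sum'` is chosen only to
+            illustrate a non-default merge):
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+
+                inputs = paddle.rand((2, 4, 32))
+                cell_fw = StackedLSTMCell(32, 64)
+                cell_bw = StackedLSTMCell(32, 64)
+                bi_rnn = BidirectionalRNN(cell_fw, cell_bw, merge_mode='sum')
+                # forward and backward outputs are added elementwise instead
+                # of being concatenated, so the last dimension stays 64
+                outputs, final_states = bi_rnn(inputs)  # [2, 4, 64]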
+ """ + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length, **kwargs) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length, **kwargs) + outputs = map_structure(self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, + outputs_bw) + final_states = map_structure( + self.merge_func, states_fw, + states_bw) if self.merge_func else (states_fw, states_bw) + return outputs, final_states + + @staticmethod + def bidirect_param_attr(param_attr): + """ + Converts `param_attr` to a pair of `param_attr` when it is not a list + or tuple with length 2, also rename every one by appending a suffix to + avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. When + it is a list or tuple, its length must be 2. + + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs + + +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each LSTM except the last one. 0 for not dropout. Default 0.0 + direction (str, optional): Indicate the direction for LSTM calculation + applying on the input sequences. It can be `forward`, `backward` or + `bidirect`. If it is `backward`, calculate in the reverse order of + input sequences. If it is `bidirect`, each layer would be a + bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, + and it concatenates their outputs as outputs. Default: `forward`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. 
If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + Examples: + .. code-block:: python + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + direction="forward", + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.direction = direction + self.num_directions = 2 if direction == 'bidirect' else 1 + self.time_major = time_major + + if direction == 'bidirect': + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) + + # maybe design cell including both forward and backward later + merge_mode = 'concat' + rnns = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + rnns.append( + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major)) + self.lstm = LayerList(rnns) + else: + lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, + is_reverse=(direction == "backward"), + time_major=time_major) + + def forward(self, input, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. 
If not provided, the paddings would be treated same as + non-padding inputs. Default None. + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. + """ + if not isinstance(self.lstm, LayerList): + return self.lstm(input, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list|tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + output, states = self.lstm[i](input, initial_states[i], + sequence_length) + input = output + stacked_states.append(states) + return output, stacked_states From 47688544de190e1ca667cae60bad7b8da9d8d241 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Sat, 15 Aug 2020 15:33:36 +0800 Subject: [PATCH 02/14] new rnn api, cell almost done --- python/paddle/nn/__init__.py | 3 + python/paddle/nn/layer/__init__.py | 4 +- python/paddle/nn/layer/rnn.py | 1465 ++++++++++++++-------------- 3 files changed, 728 insertions(+), 744 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 98948fa91e2e8..cd73ec1336a2b 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -18,6 +18,7 @@ from .layer import norm from .functional import extension from .layer import common +from .layer import rnn from . import initializer @@ -25,6 +26,7 @@ __all__ += norm.__all__ __all__ += extension.__all__ __all__ += common.__all__ +__all__ += rnn.__all__ # TODO: define alias in nn directory # from .clip import ErrorClipByValue #DEFINE_ALIAS @@ -90,6 +92,7 @@ from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.rnn import * # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index fbbcf048f2987..0d8687e91e29e 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -20,6 +20,7 @@ from . import extension from . import activation from . import norm +from . import rnn from .activation import * from .loss import * @@ -67,6 +68,3 @@ from .norm import LayerNorm #DEFINE_ALIAS from .norm import SpectralNorm #DEFINE_ALIAS from .norm import InstanceNorm #DEFINE_ALIAS -# from .rnn import RNNCell #DEFINE_ALIAS -# from .rnn import GRUCell #DEFINE_ALIAS -# from .rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b4ce7678cb747..b64f04b37879c 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -12,39 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: define classes of recurrent neural network - -__all__ = [ - 'RNNCellBase', - 'LSTMCell', - 'GRUCell', - 'StackedRNNCell', - 'StackedLSTMCell', - 'stackedGRUCell', - 'RNN', - 'BidirectionalRNN', - 'LSTM', - 'GRU', -] - import copy import collections import itertools import six +import math import sys import warnings from functools import partial, reduce -import numpy as np - from ... import fluid from ...fluid import layers +from ...fluid import initializer as I from ...fluid.data_feeder import convert_dtype from ...fluid.dygraph import Layer, LayerList from ...fluid.param_attr import ParamAttr from ...fluid.layers import utils, BeamSearchDecoder from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as +# TODO: define classes of recurrent neural network + +__all__ = [ + 'RNNCellBase', + 'SimpleRNNCell', + 'LSTMCell', + 'GRUCell', + 'StackedRNNCell', + 'StackedLSTMCell', + # 'stackedGRUCell', + 'RNN', + 'BidirectionalRNN', + 'LSTM', + # 'GRU', +] + class RNNCellBase(Layer): """ @@ -163,6 +164,53 @@ def state_dtype(self): "Please add implementaion for `state_dtype` in the used cell.") +class SimpleRNNCell(RNNCellBase): + def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): + super(SimpleRNNCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = self.create_parameter( + (hidden_size, input_size), default_initializer=I.Uniform(-std, std)) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) + self.bias_ih = self.create_parameter( + (hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) + self.bias_hh = self.create_parameter( + (hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.input_size = input_size + self.hidden_size = hidden_size + if nonlinearity not in ["tanh", "relu"]: + raise ValueError( + "nonlinearity for SimpleRNNCell should be tanh or relu, " + "but get {}".format(nonlinearity)) + self.nonlinearity = nonlinearity + self._nonlinear_fn = layers.tanh \ + if nonlinearity == "tanh" \ + else layers.relu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = layers.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._nonlinear_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + class LSTMCell(RNNCellBase): """ Long-Short Term Memory(LSTM) RNN cell. 
@@ -197,69 +245,37 @@ class LSTMCell(RNNCellBase): outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(LSTMCell, self).__init__(dtype) - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = getattr(layers, "sigmoid") - self._activation = getattr(layers, "tanh") - self._param_attr = ParamAttr._to_attr(param_attr) - self._bias_attr = ParamAttr._to_attr(bias_attr) - self._dtype = dtype - - if self._param_attr and self._param_attr.name is not None: - weight_ih_param_attr = copy.deepcopy(self._param_attr) - weight_hh_param_attr = copy.deepcopy(self._param_attr) - weight_ih_param_attr.name += "_weight_ih" - weight_hh_param_attr.name += "_weight_hh" - else: - weight_ih_param_attr = self._param_attr - weight_hh_param_attr = self._param_attr - - if self._bias_attr and self._bias_attr.name is not None: - bias_ih_param_attr = copy.deepcopy(self._bias_attr) - bias_hh_param_attr = copy.deepcopy(self._bias_attr) - bias_ih_param_attr.name += "_bias_ih" - bias_hh_param_attr.name += "_bias_hh" - else: - bias_ih_param_attr = self._bias_attr - bias_hh_param_attr = self._bias_attr - + def __init__(self, input_size, hidden_size, name=None): + super(LSTMCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - attr=weight_ih_param_attr, - shape=[4 * hidden_size, input_size], - dtype=dtype) - + (4 * hidden_size, input_size), + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( - attr=weight_hh_param_attr, - shape=[4 * hidden_size, hidden_size], - dtype=dtype) - + (4 * hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( - attr=bias_ih_param_attr, - shape=[4 * hidden_size], - dtype=dtype, - is_bias=True) + (4 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( - attr=bias_hh_param_attr, - shape=[4 * hidden_size], - dtype=dtype, - is_bias=True) + (4 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) - def forward(self, inputs, states): + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = layers.sigmoid + self._activation = layers.tanh + + def forward(self, inputs, states=None): """ Performs single step LSTM calculations. Parameters: inputs (Variable): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula. The data type should be float32 or float64. - states (Variable): A list of containing two tensors, each shaped + states (Variable): A tuple of two tensors, each shaped `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. The data type should be float32 or float64. Returns: @@ -270,15 +286,17 @@ def forward(self, inputs, states): to :math:`h_{t}, c_{t}` in the formula. The data type of these \ tensors all is same as that of `states`. 
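+        Examples:
+            A single-step sketch (it assumes this cell is exported as
+            `paddle.nn.LSTMCell` through the updated `paddle.nn` imports in
+            this patch):
+
+            .. code-block:: python
+
+                import paddle
+
+                x = paddle.rand((4, 16))
+                cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
+                # with `states=None`, zero states are created internally via
+                # `get_initial_states`
+                out, (h, c) = cell(x)
+                # out, h and c all have shape [4, 32]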
""" + if states is None: + states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) - if self.bias_ih: + if self.bias_ih is not None: gates = gates + self.bias_ih gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh: + if self.bias_hh is not None: gates = gates + self.bias_hh - chunked_gates = layers.split(gates, num_or_sections=4, dim=1) + chunked_gates = layers.split(gates, num_or_sections=4, dim=-1) i = self._gate_activation(chunked_gates[0]) f = self._gate_activation(chunked_gates[1]) @@ -286,7 +304,7 @@ def forward(self, inputs, states): c = f * pre_cell + i * self._activation(chunked_gates[2]) h = o * self._activation(c) - return h, [h, c] + return h, (h, c) @property def state_shape(self): @@ -295,7 +313,7 @@ def state_shape(self): (-1 for batch size would be automatically inserted into shape). These two shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. """ - return [[self.hidden_size], [self.hidden_size]] + return ((self.hidden_size, ), (self.hidden_size, )) class GRUCell(RNNCellBase): @@ -337,62 +355,30 @@ class GRUCell(RNNCellBase): outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - dtype='float32'): + def __init__(self, input_size, hidden_size, name=None): super(GRUCell, self).__init__() - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = getattr(layers, "sigmoid") - self._activation = getattr(layers, "tanh") - self._param_attr = ParamAttr._to_attr(param_attr) - self._bias_attr = ParamAttr._to_attr(bias_attr) - self._dtype = dtype - - if self._param_attr and self._param_attr.name is not None: - weight_ih_param_attr = copy.deepcopy(self._param_attr) - weight_hh_param_attr = copy.deepcopy(self._param_attr) - weight_ih_param_attr.name += "_weight_ih" - weight_hh_param_attr.name += "_weight_hh" - else: - weight_ih_param_attr = self._param_attr - weight_hh_param_attr = self._param_attr - - if self._bias_attr and self._bias_attr.name is not None: - bias_ih_param_attr = copy.deepcopy(self._bias_attr) - bias_hh_param_attr = copy.deepcopy(self._bias_attr) - bias_ih_param_attr.name += "_bias_ih" - bias_hh_param_attr.name += "_bias_hh" - else: - bias_ih_param_attr = self._bias_attr - bias_hh_param_attr = self._bias_attr - + std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - attr=weight_ih_param_attr, - shape=[3 * hidden_size, input_size], - dtype=dtype) - + (3 * hidden_size, input_size), + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( - attr=weight_hh_param_attr, - shape=[3 * hidden_size, hidden_size], - dtype=dtype) - + (3 * hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( - attr=bias_ih_param_attr, - shape=[3 * hidden_size], - dtype=dtype, - is_bias=True) + (3 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( - attr=bias_hh_param_attr, - shape=[3 * hidden_size], - dtype=dtype, - is_bias=True) + (3 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) - def forward(self, inputs, states): + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = layers.sigmoid + self._activation = layers.tanh + + def forward(self, inputs, states=None): """ Performs single step GRU calculations. 
@@ -410,23 +396,24 @@ def forward(self, inputs, states): corresponding to :math:`h_t` in the formula. The data type of the \ tensor is same as that of `states`. """ - pre_hidden = states + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_hidden = states x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) - if self.bias_ih: + if self.bias_ih is not None: x_gates = x_gates + self.bias_ih h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh: + if self.bias_hh is not None: h_gates = h_gates + self.bias_hh - x_u, x_r, x_c = layers.split(x_gates, num_or_sections=3, dim=1) - h_u, h_r, h_c = layers.split(x_gates, num_or_sections=3, dim=1) + x_r, x_z, x_c = layers.split(x_gates, num_or_sections=3, dim=1) + h_r, h_z, h_c = layers.split(h_gates, num_or_sections=3, dim=1) - u = self._gate_activation(x_u + h_u) r = self._gate_activation(x_r + h_r) - h_c = r * h_c - c = self._activation(x_c + h_c) - h = u * pre_hidden + (1 - u) * c + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c return h, h @@ -437,235 +424,330 @@ def state_shape(self): size would be automatically inserted into shape). The shape corresponds to :math:`h_{t-1}`. """ - return [self._hidden_size] + return (self.hidden_size, ) -class StackedRNNCell(RNNCellBase): +class RNN(Layer): """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. Parameters: - cells (list|tuple): List of RNN cell instances. + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. Examples: .. code-block:: python - from paddle import LSTMCell, StackedRNNCell - cells = [LSTMCell(32, 32), LSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) \ + if time_major else (0, 1) - def forward(self, inputs, states, **kwargs): + def forward(self, inputs, initial_states=None, sequence_length=None): """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. 
- + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state, **kwargs) - inputs = outputs - new_states.append(new_state) - return outputs, new_states + flat_inputs = flatten(inputs) + batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - Returns: - list: A list composed of each including cell's `param_attr`. 
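# A simplified, framework-free sketch of the unrolling and sequence_length
# semantics documented for RNN.forward above: the cell runs once per time
# step, and when sequence_length is given, the state of a finished sequence is
# carried over unchanged so the final state is the last valid one. This
# mirrors the behaviour only, not the actual implementation; a single-tensor
# state and batch-major layout are assumed for brevity.
import numpy as np

def unroll(cell_step, inputs, init_state, sequence_length=None):
    # inputs: [batch, time, feature]  (time_major=False)
    time_steps = inputs.shape[1]
    state = init_state
    outputs = []
    for t in range(time_steps):
        out, new_state = cell_step(inputs[:, t], state)
        if sequence_length is not None:
            # freeze states of sequences that have already ended; outputs at
            # padded steps are left as-is, matching the patch
            mask = (t < sequence_length).astype(inputs.dtype)[:, None]
            new_state = mask * new_state + (1.0 - mask) * state
        state = new_state
        outputs.append(out)
    return np.stack(outputs, axis=1), state  # [batch, time, ...], final state

# toy "cell" with a single tensor state: accumulate the inputs
step = lambda x, s: (x + s, x + s)
outs, last = unroll(step,
                    np.ones((2, 5, 3), "float32"),
                    np.zeros((2, 3), "float32"),
                    sequence_length=np.array([5, 3]))
# outs.shape == (2, 5, 3); last[1] is the state after step 3 of the shorter sequence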
- """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, + dtype=inputs.dtype, + batch_dim_idx=self.batch_index) - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] + if fluid.in_dygraph_mode(): + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] -class StackedLSTMCell(RNNCellBase): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. + def append(self, x): + self.array.append(x) + return self - The formula for LSTM used here is as follows: + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = layers.elementwise_mul( + new_state, step_mask, axis=0) - layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + if not self.time_major: + inputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. + if sequence_length is not None: + mask = layers.sequence_mask( + sequence_length, maxlen=time_steps, dtype=inputs.dtype) + mask = layers.transpose(mask, [1, 0]) - Examples: - .. 
code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ + if self.is_reverse: + inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), + inputs) + mask = layers.reverse( + mask, axis=[0]) if sequence_length is not None else None - def __init__(self, - input_size, - hidden_size, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.hidden_size = hidden_size - self.input_size = input_size - self.num_layers = num_layers - self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - LSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + final_outputs = map_structure( + lambda x: layers.stack(x.array, axis=self.time_step_index), + outputs) - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. + if self.is_reverse: + final_outputs = map_structure( + lambda x: layers.reverse(x, axis=self.time_step_index), + final_outputs) - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. + final_states = new_states + else: + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - # TODO(guosheng): maybe should stack list of states as one tensor - return outputs, new_states - @property - def state_shape(self): +class BidirectionalRNN(Layer): + """ + Wrapper for bidirectional RNN. 
It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. + Parameters: + cell_fw (RNNCell): A RNNCell instance used for forward RNN. + cell_bw (RNNCell): A RNNCell instance used for backward RNN. + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + Examples: + .. code-block:: python + import paddle + from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN + inputs = paddle.rand((2, 4, 32)) + cell_fw = StackedLSTMCell(32, 64) + cell_bw = StackedLSTMCell(32, 64) + bi_rnn = BidirectionalRNN(cell_fw, cell_bw) + outputs, _ = bi_rnn(inputs) # [2, 4, 128] + """ + + def __init__(self, + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None + else: + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. + Performs forward and backward RNN separately, and merge outputs of these + two RNN according to `merge_mode`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (Variable|list|tuple): If it is a list or tuple, its + length should be 2 to include initial states of forward and backward + RNN separately. Otherwise it would be used twice for the two RNN. + If None, `cell.get_initial_states` would be used to produce the initial + states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. Returns: - list: A list composed of each including LSTM cell's `state_shape`. + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is produced by merge outputs of forward and backward RNN according \ + to `merge_mode`; similarly, `final_states` is produced by merge \ + `final_states` of forward and backward RNN. 
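# What the merge_mode options wired up in BidirectionalRNN.__init__ above do
# to the per-direction outputs, shown on plain NumPy arrays (illustration only).
import numpy as np

fw = np.ones((2, 4, 8), "float32")   # forward-direction outputs
bw = np.ones((2, 4, 8), "float32")   # backward-direction outputs

concat = np.concatenate([fw, bw], axis=-1)   # 'concat' -> [2, 4, 16]
summed = fw + bw                             # 'sum'    -> [2, 4, 8]
avg    = 0.5 * (fw + bw)                     # 'ave'    -> [2, 4, 8]
prod   = fw * bw                             # 'mul'    -> [2, 4, 8]
zipped = (fw, bw)                            # 'zip' / None -> keep both tensors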
""" - return [cell.state_shape for cell in self.cells] + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length, **kwargs) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length, **kwargs) + outputs = map_structure(self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, + outputs_bw) + final_states = map_structure( + self.merge_func, states_fw, + states_bw) if self.merge_func else (states_fw, states_bw) + return outputs, final_states + @staticmethod + def bidirect_param_attr(param_attr): + """ + Converts `param_attr` to a pair of `param_attr` when it is not a list + or tuple with length 2, also rename every one by appending a suffix to + avoid having same names when `param_attr` contains a name. -class StackedGRUCell(RNNCellBase): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. When + it is a list or tuple, its length must be 2. - The formula for GRU used here is as follows: + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs - .. math:: - u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) +class SimpleRNN(Layer): + pass - r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) - \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + The formula for LSTM used here is as follows: + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - num_layers(int, optional): The number of GRU to be stacked. Default 1. + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. dropout(float, optional): The dropout probability applied on the outputs - of each GRU cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. 
If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, + of each LSTM except the last one. 0 for not dropout. Default 0.0 + direction (str, optional): Indicate the direction for LSTM calculation + applying on the input sequences. It can be `forward`, `backward` or + `bidirect`. If it is `backward`, calculate in the reverse order of + input sequences. If it is `bidirect`, each layer would be a + bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, + and it concatenates their outputs as outputs. Default: `forward`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. Default None. bias_attr (list|tuple|ParamAttr): A list, tuple or something can be @@ -675,413 +757,339 @@ class StackedGRUCell(RNNCellBase): Default None. dtype(string, optional): The data type used in this cell. It can be float32 or float64. Default float32. - Examples: - .. code-block:: python - import paddle - + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] """ def __init__(self, input_size, hidden_size, num_layers=1, + direction="forward", dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.hidden_size = hidden_size + time_major=False, + name=None): + super(LSTM, self).__init__() self.input_size = input_size + self.hidden_size = hidden_size self.num_layers = num_layers self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + self.direction = direction + self.num_directions = 2 if direction == 'bidirect' else 1 + self.time_major = time_major - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + if direction == 'bidirect': + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. 
+ # maybe design cell including both forward and backward later + merge_mode = 'concat' + rnns = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + rnns.append( + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major)) + self.lstm = LayerList(rnns) + else: + lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, + is_reverse=(direction == "backward"), + time_major=time_major) + def forward(self, input, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. 
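# Shape bookkeeping behind the `bidirect` branch of LSTM.__init__ above: with
# merge_mode 'concat' every layer emits [batch, seq, 2 * hidden_size], so each
# layer after the first receives 2 * hidden_size input features. Pure NumPy,
# shapes only; the numbers are an assumed example.
import numpy as np

batch, seq_len, input_size, hidden_size = 2, 4, 32, 64
x = np.zeros((batch, seq_len, input_size), "float32")
for layer in range(3):                       # e.g. num_layers = 3
    in_width = x.shape[-1]                   # 32 for layer 0, then 128
    fw = np.zeros((batch, seq_len, hidden_size), "float32")
    bw = np.zeros((batch, seq_len, hidden_size), "float32")
    x = np.concatenate([fw, bw], axis=-1)    # 'concat' merge -> [2, 4, 128]
print(x.shape)  # (2, 4, 128)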
""" - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states + if not isinstance(self.lstm, LayerList): + return self.lstm(input, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list|tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + output, states = self.lstm[i](input, initial_states[i], + sequence_length) + input = output + stacked_states.append(states) + return output, stacked_states - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] +class GRU(Layer): + pass -class RNN(Layer): +# TODO: restucture RNN layers +class StackedRNNCell(RNNCellBase): """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. Parameters: - cell(RNNCell): An instance of `RNNCell`. - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + cells (list|tuple): List of RNN cell instances. Examples: .. code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + from paddle import LSTMCell, StackedRNNCell + cells = [LSTMCell(32, 32), LSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) """ - def __init__(self, cell, is_reverse=False, time_major=False): - super(RNN, self).__init__() - self.cell = cell - if not hasattr(self.cell, "call"): - # for non-dygraph mode, `rnn` api uses cell.call - self.cell.call = self.cell.forward - self.is_reverse = is_reverse - self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) if time_major else (0, - 1) + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = LayerList(cells) - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): + def forward(self, inputs, states): """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
- The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. 
""" - flat_inputs = flatten(inputs) - batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) - - if initial_states is None: - initial_states = self.cell.get_initial_states( - batch_ref=inputs, - dtype=self.cell.dtype if hasattr(self.cell, "dtype") else - self.cell.parameters()[0].dtype, - batch_dim_idx=self.batch_index) - - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = layers.elementwise_mul( - new_state, step_mask, axis=0) - layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - if not self.time_major: - inputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = layers.sequence_mask( - sequence_length, - maxlen=time_steps, - dtype=flatten(initial_states)[0].dtype) - mask = layers.transpose(mask, [1, 0]) + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state) + inputs = outputs + new_states.append(new_state) + return outputs, new_states - if self.is_reverse: - inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), - inputs) - mask = layers.reverse( - mask, axis=[0]) if sequence_length is not None else None + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, - **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + Returns: + list: A list composed of each including cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] - final_outputs = map_structure( - lambda x: layers.stack(x.array, axis=self.time_step_index), - outputs) - if self.is_reverse: - final_outputs = map_structure( - lambda x: layers.reverse(x, axis=self.time_step_index), - final_outputs) +class StackedLSTMCell(RNNCellBase): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. - final_states = new_states - else: - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) - return final_outputs, final_states + The formula for LSTM used here is as follows: + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) -class BidirectionalRNN(Layer): - """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each LSTM cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + Examples: .. 
code-block:: python import paddle - from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`; similarly, `final_states` is produced by merge \ - `final_states` of forward and backward RNN. 
- """ - if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" - else: - initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - final_states = map_structure( - self.merge_func, states_fw, - states_bw) if self.merge_func else (states_fw, states_bw) - return outputs, final_states + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + LSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) - @staticmethod - def bidirect_param_attr(param_attr): + def forward(self, inputs, states): """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + # TODO(guosheng): maybe should stack list of states as one tensor + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] -class LSTM(Layer): +class StackedGRUCell(RNNCellBase): """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. - The formula for LSTM used here is as follows: + The formula for GRU used here is as follows: .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + num_layers(int, optional): The number of GRU to be stacked. Default 1. dropout(float, optional): The dropout probability applied on the outputs - of each LSTM except the last one. 0 for not dropout. Default 0.0 - direction (str, optional): Indicate the direction for LSTM calculation - applying on the input sequences. It can be `forward`, `backward` or - `bidirect`. If it is `backward`, calculate in the reverse order of - input sequences. If it is `bidirect`, each layer would be a - bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, - and it concatenates their outputs as outputs. Default: `forward`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + of each GRU cell except the last one. 0 for no dropout. Default 0.0 param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is a list or tuple, it's length must equal to `num_layers`. Otherwise, @@ -1094,14 +1102,17 @@ class LSTM(Layer): Default None. dtype(string, optional): The data type used in this cell. It can be float32 or float64. Default float32. + Examples: + .. 
code-block:: python + import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM + inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] + cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, @@ -1109,95 +1120,67 @@ def __init__(self, hidden_size, num_layers=1, dropout=0.0, - direction="forward", - time_major=False, param_attr=None, bias_attr=None, - dtype='float32'): - super(LSTM, self).__init__() - self.input_size = input_size + dtype="float32"): + super(StackedGRUCell, self).__init__() self.hidden_size = hidden_size + self.input_size = input_size self.num_layers = num_layers self.dropout = dropout - self.direction = direction - self.num_directions = 2 if direction == 'bidirect' else 1 - self.time_major = time_major - - if direction == 'bidirect': - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - # maybe design cell including both forward and backward later - merge_mode = 'concat' - rnns = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - rnns.append( - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major)) - self.lstm = LayerList(rnns) - else: - lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, - is_reverse=(direction == "backward"), - time_major=time_major) + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) - def forward(self, input, initial_states=None, sequence_length=None): + def forward(self, inputs, states): """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. + Performs the stacked GRU cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. 
- sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last GRU; `new_states` \ + is a list composed of every GRU `new_states` which is also \ + :math:`h_{t}` in the formula, and the data type and structure \ + of these tensors all is same as that of `states`. """ - if not isinstance(self.lstm, LayerList): - return self.lstm(input, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list|tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - output, states = self.lstm[i](input, initial_states[i], - sequence_length) - input = output - stacked_states.append(states) - return output, stacked_states + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedGRUCell is a list composed of each including + GRU cell's `state_shape`. + + Returns: + list: A list composed of each including GRU cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] From a88fb8887396d45175fa6da55e9800af7f739632 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 18 Aug 2020 17:56:45 +0800 Subject: [PATCH 03/14] add new progresses in rnn APIs for 2.0 --- python/paddle/fluid/layers/rnn.py | 99 ++++- python/paddle/nn/layer/rnn.py | 658 ++++++++++++++++++------------ 2 files changed, 485 insertions(+), 272 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index ecc5876852283..6260c5684488a 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -492,6 +492,95 @@ def rnn(cell, cell = fluid.layers.GRUCell(hidden_size=128) outputs = fluid.layers.rnn(cell=cell, inputs=inputs) """ + if in_dygraph_mode: + return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length, + time_major, is_reverse, **kwargs) + else: + return _rnn_static_graph(cell, inputs, initial_states, sequence_length, + time_major, is_reverse, **kwargs) + + +class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + +def _maybe_copy(state, new_state, step_mask): + """update rnn state or just pass the old state through""" + new_state = nn.elementwise_mul(new_state, step_mask, axis=0) \ + + nn.elementwise_mul(state, (1 - step_mask), axis=0) + return new_state + + +def _transpose_batch_time(x): + perm = [1, 0] + list(range(2, len(x.shape))) + return nn.transpose(x, perm) + + +def _rnn_dynamic_graph(cell, + inputs, + initial_states=None, + sequence_length=None, + time_major=False, + is_reverse=False, + **kwargs): + + time_step_index = 0 if time_major else 1 + flat_inputs = flatten(inputs) + time_steps = flat_inputs[0].shape[time_step_index] + + if not time_major: + inputs = map_structure(_transpose_batch_time, inputs) + + if sequence_length is not None: + mask = sequence_lod.sequence_mask( + sequence_length, maxlen=time_steps, dtype=inputs.dtype) + mask = nn.transpose(mask, [1, 0]) + + if is_reverse: + inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs) + mask = tensor.reverse(mask, axis=[0]) \ + if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = cell(step_inputs, states, **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), states, new_states) + states = new_states + outputs = map_structure(lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), + step_outputs, outputs) + + final_outputs = map_structure( + lambda x: nn.stack(x.array, axis=time_step_index), + outputs) + + if is_reverse: + final_outputs = map_structure( + lambda x: tensor.reverse(x, axis=time_step_index), + final_outputs) + + final_states = new_states + return final_outputs, final_states + + +def _rnn_static_graph(cell, + inputs, + initial_states=None, + sequence_length=None, + time_major=False, + is_reverse=False, + **kwargs): check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn') if isinstance(inputs, (list, tuple)): for i, input_x in enumerate(inputs): @@ -513,16 +602,6 @@ def rnn(cell, check_type(sequence_length, 'sequence_length', (Variable, type(None)), 'rnn') - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = nn.elementwise_mul( - new_state, step_mask, axis=0) - nn.elementwise_mul( - 
state, (step_mask - 1), axis=0) - return new_state - - def _transpose_batch_time(x): - return nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))) - def _switch_grad(x, stop=False): x.stop_gradient = stop return x diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b64f04b37879c..e93a34f59a09c 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -37,16 +37,66 @@ 'SimpleRNNCell', 'LSTMCell', 'GRUCell', - 'StackedRNNCell', - 'StackedLSTMCell', - # 'stackedGRUCell', 'RNN', - 'BidirectionalRNN', + 'BiRNN', + 'SimpleRNN', 'LSTM', - # 'GRU', + 'GRU', + # 'StackedRNNCell', + # 'StackedLSTMCell', + # 'stackedGRUCell', ] +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = layers.unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + states = tuple([layers.unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return layers.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [layers.stack(item) for item in componnets] + + +def birnn(cell_fw, cell_bw, inputs, states_fw, states_bw, sequence_length, + time_major): + outputs_fw, states_fw = layers.rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = layers.rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = map_structure(lambda x, y: layers.concat([x, y], -1), outputs_fw, + outputs_bw) + + final_states = (states_fw, states_bw) + return outputs, final_states + + class RNNCellBase(Layer): """ RNNCellBase is the base class for abstraction representing the calculations @@ -460,8 +510,8 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.cell.call = self.cell.forward self.is_reverse = is_reverse self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) \ - if time_major else (0, 1) + # self.batch_index, self.time_step_index = (1, 0) \ + # if time_major else (0, 1) def forward(self, inputs, initial_states=None, sequence_length=None): """ @@ -494,9 +544,6 @@ def forward(self, inputs, initial_states=None, sequence_length=None): thus has the same structure with it and has tensors with same shapes \ and data types. 
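# The state layout assumed by split_states / concat_states introduced above,
# shown with NumPy for a single-component state (SimpleRNN/GRU hidden state):
# states are stacked as [num_layers * num_directions, batch, hidden];
# split_states yields one entry per layer, pairing forward/backward slices
# when bidirectional, and concat_states stacks them back. LSTM states use
# state_components=2, giving one stacked tensor per component (h and c).
import numpy as np

num_layers, batch, hidden = 2, 4, 8
stacked = np.zeros((num_layers * 2, batch, hidden), "float32")   # bidirectional

per_layer = list(zip(stacked[0::2], stacked[1::2]))              # [(fw, bw), (fw, bw)]
assert len(per_layer) == num_layers
assert per_layer[0][0].shape == (batch, hidden)

restacked = np.stack([s for pair in per_layer for s in pair])    # back to [4, 4, 8]
assert restacked.shape == stacked.shape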
""" - flat_inputs = flatten(inputs) - batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) if initial_states is None: initial_states = self.cell.get_initial_states( @@ -504,79 +551,17 @@ def forward(self, inputs, initial_states=None, sequence_length=None): dtype=inputs.dtype, batch_dim_idx=self.batch_index) - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = layers.elementwise_mul( - new_state, step_mask, axis=0) - layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - if not self.time_major: - inputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = layers.sequence_mask( - sequence_length, maxlen=time_steps, dtype=inputs.dtype) - mask = layers.transpose(mask, [1, 0]) - - if self.is_reverse: - inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), - inputs) - mask = layers.reverse( - mask, axis=[0]) if sequence_length is not None else None - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - - final_outputs = map_structure( - lambda x: layers.stack(x.array, axis=self.time_step_index), - outputs) - - if self.is_reverse: - final_outputs = map_structure( - lambda x: layers.reverse(x, axis=self.time_step_index), - final_outputs) - - final_states = new_states - else: - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse) + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) return final_outputs, final_states -class BidirectionalRNN(Layer): +class BiRNN(Layer): """ Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform forward and backward RNN separately, and merge outputs of these two RNN @@ -584,11 +569,7 @@ class BidirectionalRNN(Layer): Parameters: cell_fw (RNNCell): A RNNCell instance used for forward RNN. cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` + Examples: .. 
code-block:: python import paddle @@ -600,32 +581,11 @@ class BidirectionalRNN(Layer): outputs, _ = bi_rnn(inputs) # [2, 4, 128] """ - def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major def forward(self, inputs, @@ -659,21 +619,15 @@ def forward(self, `final_states` of forward and backward RNN. """ if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" else: initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - final_states = map_structure( - self.merge_func, states_fw, - states_bw) if self.merge_func else (states_fw, states_bw) + states_fw, states_bw = initial_states + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + states_fw, states_bw, sequence_length, + self.time_major) return outputs, final_states @staticmethod @@ -710,162 +664,342 @@ def bidirect_param_attr(param_attr): return param_attrs -class SimpleRNN(Layer): - pass +class SimpleRNN(LayerList): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + nonlinearity="tanh", + direction="forward", + dropout=0., + time_major=False, + name=None): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + self.dropout = dropout + 
self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers -class LSTM(Layer): - """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + initial_states = layers.zeros(state_shape, dtype=inputs.dtype) + + states = split_states(initial_states, self.num_directions == 2) + final_states = [] + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs - The formula for LSTM used here is as follows: + final_states = concat_states(final_states, self.num_directions == 2) + return outputs, final_states - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM except the last one. 0 for not dropout. Default 0.0 - direction (str, optional): Indicate the direction for LSTM calculation - applying on the input sequences. It can be `forward`, `backward` or - `bidirect`. If it is `backward`, calculate in the reverse order of - input sequences. If it is `bidirect`, each layer would be a - bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, - and it concatenates their outputs as outputs. Default: `forward`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - Examples: - .. 
code-block:: python - import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM - inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] - """ +class LSTM(LayerList): def __init__(self, input_size, hidden_size, num_layers=1, direction="forward", - dropout=0.0, + dropout=0., time_major=False, name=None): super(LSTM, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.num_layers = num_layers + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + self.dropout = dropout - self.direction = direction - self.num_directions = 2 if direction == 'bidirect' else 1 + self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major + self.num_layers = num_layers - if direction == 'bidirect': - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - merge_mode = 'concat' - rnns = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - rnns.append( - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major)) - self.lstm = LayerList(rnns) - else: - lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, - is_reverse=(direction == "backward"), - time_major=time_major) + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + init_h = layers.zeros(state_shape, dtype=inputs.dtype) + init_c = layers.zeros(state_shape, dtype=inputs.dtype) + initial_states = (init_h, init_c) + + states = split_states(initial_states, self.num_directions == 2, 2) + final_states = [] + for i, rnn_layer in enumerate(self): + if i 
> 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs - def forward(self, input, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. - """ - if not isinstance(self.lstm, LayerList): - return self.lstm(input, initial_states, sequence_length) + final_states = concat_states(final_states, self.num_directions == 2, 2) + return outputs, final_states + + +class GRU(LayerList): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False, + name=None): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size) + cell_bw = GRUCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size) + cell_bw = GRUCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list|tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - output, states = self.lstm[i](input, initial_states[i], - sequence_length) - input = output - stacked_states.append(states) - return output, stacked_states - - -class GRU(Layer): - pass + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = 
inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + initial_states = layers.zeros(state_shape, dtype=inputs.dtype) + states = split_states(initial_states, self.num_directions == 2) + + final_states = [] + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2) + return outputs, final_states + + +# class LSTM(Layer): +# """ +# Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input +# sequence. + +# The formula for LSTM used here is as follows: + +# .. math:: +# i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) +# f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) +# o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) +# c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) +# h_{t} & = o_{t} \\tanh (c_{t}) + +# Parameters: +# input_size (int): The input feature size for the first LSTM. +# hidden_size (int): The hidden size for every LSTM. +# num_layers(int, optional): The number of LSTM to be stacked. Default 1. +# dropout(float, optional): The dropout probability applied on the outputs +# of each LSTM except the last one. 0 for not dropout. Default 0.0 +# direction (str, optional): Indicate the direction for LSTM calculation +# applying on the input sequences. It can be `forward`, `backward` or +# `bidirect`. If it is `backward`, calculate in the reverse order of +# input sequences. If it is `bidirect`, each layer would be a +# bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, +# and it concatenates their outputs as outputs. Default: `forward`. +# time_major (bool, optional): Indicate the data layout of Tensor included +# in `input` and `output` tensors. If `False`, the data layout would +# be batch major with shape `[batch_size, sequence_length, ...]`. If +# `True`, the data layout would be time major with shape +# `[sequence_length, batch_size, ...]`. Default: `False`. +# param_attr (list|tuple|ParamAttr): A list, tuple or something can be +# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is +# a list or tuple, it's length must equal to `num_layers`. Otherwise, +# construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. +# Default None. +# bias_attr (list|tuple|ParamAttr): A list, tuple or something can be +# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is +# a list or tuple, it's length must equal to `num_layers`. Otherwise, +# construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. +# Default None. +# dtype(string, optional): The data type used in this cell. It can be +# float32 or float64. Default float32. +# Examples: +# .. 
code-block:: python +# import paddle +# import paddle.fluid as fluid +# from paddle.incubate.hapi.text import LSTM +# inputs = paddle.rand((2, 4, 32)) +# lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) +# outputs, _ = lstm(inputs) # [2, 4, 64] +# """ + +# def __init__(self, +# input_size, +# hidden_size, +# num_layers=1, +# direction="forward", +# dropout=0.0, +# time_major=False, +# name=None): +# super(LSTM, self).__init__() +# self.input_size = input_size +# self.hidden_size = hidden_size +# self.num_layers = num_layers +# self.dropout = dropout +# self.direction = direction +# self.num_directions = 2 if direction == 'bidirect' else 1 +# self.time_major = time_major + +# if direction == 'bidirect': +# param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) +# bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) +# fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], +# num_layers) +# bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], +# num_layers) +# fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], +# num_layers) +# bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], +# num_layers) + +# # maybe design cell including both forward and backward later +# merge_mode = 'concat' +# rnns = [] +# for i in range(num_layers): +# cell_fw = StackedLSTMCell(input_size if i == 0 else ( +# hidden_size * 2 if merge_mode == 'concat' else +# hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], +# fw_bias_attrs[i], dtype) +# cell_bw = StackedLSTMCell(input_size if i == 0 else ( +# hidden_size * 2 if merge_mode == 'concat' else +# hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], +# bw_bias_attrs[i], dtype) +# rnns.append( +# BidirectionalRNN( +# cell_fw, +# cell_bw, +# merge_mode=merge_mode, +# time_major=time_major)) +# self.lstm = LayerList(rnns) +# else: +# lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, +# dropout, param_attr, bias_attr, dtype) +# self.lstm = RNN(lstm_cell, +# is_reverse=(direction == "backward"), +# time_major=time_major) + +# def forward(self, input, initial_states=None, sequence_length=None): +# """ +# Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` +# is the `inputs` of the subsequent one. +# Parameters: +# inputs (Variable): The inputs for the first LSTM. It is a float32 +# or float64 tensor shaped `[batch_size, sequence_length, input_size]`. +# initial_states (list|None, optional): A list containing initial states +# of all stacked LSTM, and the initial states of each LSTM is a pair +# of tensors shaped `[batch_size, hidden_size]`. If not provided, +# use 0 as initial states. Default None. +# sequence_length (Variable, optional): A tensor with shape `[batch_size]`. +# It stores real length of each instance, thus enables users to extract +# the last valid state when past a batch element's sequence length for +# correctness. If not provided, the paddings would be treated same as +# non-padding inputs. Default None. +# Returns: +# tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ +# is the output of last LSTM and it is a tensor with shape \ +# `[batch_size, sequence_length, hidden_size]` and has the same \ +# data type as `inputs`, `final_states` is the counterpart of \ +# `initial_states` at last time step, thus has the same structure \ +# with it and has tensors with same shapes data types. 
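#
#     A minimal usage sketch for the multi-layer LSTM defined above, assuming
#     `paddle.nn.LSTM` is the exported name; sizes are arbitrary and kept as
#     comments to match the surrounding block:
#
#         import paddle
#         paddle.disable_static()
#
#         lstm = paddle.nn.LSTM(input_size=16, hidden_size=32, num_layers=2)
#         x = paddle.rand((4, 23, 16))    # [batch_size, time_steps, input_size]
#         y, (h, c) = lstm(x)             # y: [4, 23, 32]; h, c: [2, 4, 32]
#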
+# """ +# if not isinstance(self.lstm, LayerList): +# return self.lstm(input, initial_states, sequence_length) +# else: +# if isinstance(initial_states, (list, tuple)): +# assert len(initial_states) == self.num_layers, ( +# "length of initial_states should be %d when it is a list|tuple" +# % self.num_layers) +# else: +# initial_states = [initial_states] * self.num_layers +# stacked_states = [] +# for i in range(self.num_layers): +# output, states = self.lstm[i](input, initial_states[i], +# sequence_length) +# input = output +# stacked_states.append(states) +# return output, stacked_states # TODO: restucture RNN layers From 5fb65ba119f42f20e6af9fdc9fcefbc848759a30 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 20 Aug 2020 20:12:13 +0800 Subject: [PATCH 04/14] refine rnn APIs and docstrings. --- python/paddle/fluid/layers/rnn.py | 143 ++- python/paddle/nn/functional/__init__.py | 5 +- python/paddle/nn/functional/rnn.py | 8 +- python/paddle/nn/layer/rnn.py | 1515 +++++++++++------------ 4 files changed, 826 insertions(+), 845 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index e922cd48267c8..ae6539370f25f 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -438,24 +438,22 @@ def rnn(cell, is_reverse=False, **kwargs): """ - :api_attr: Static Graph - rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` repeatedly until reaches to the maximum - length of `inputs`. + which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) + repeatedly until reaches to the maximum length of `inputs`. - Parameters: - cell(RNNCell): An instance of `RNNCell`. - inputs(Variable): A (possibly nested structure of) tensor variable[s]. + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): A (possibly nested structure of) tensor[s]. The shape of tensor should be `[batch_size, sequence_length, ...]` for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. It represents the inputs to be unrolled in RNN. - initial_states(Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. + initial_states(Tensor, optional): A (possibly nested structure of) + tensor[s], representing the initial state for RNN. If not provided, `cell.get_initial_states` would be used to produce the initial state. Default None. - sequence_length(Variable, optional): A tensor with shape `[batch_size]`. + sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. It stores real length of each instance, thus enables users to extract the last valid state when past a batch element's sequence length for correctness. If not provided, the paddings would be treated same as @@ -470,30 +468,33 @@ def rnn(cell, **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.call` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. 
\ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequence. Tensor or nested + structure of Tensor. + If `time_major` is True, the shape of each tensor in outpus is + `[time_steps, batch_size, hidden_size]`, else + `[batch_size, time_steps, hidden_size]`. + final_states (Tensor|list|tuple): final states. A (possibly nested structure of) + tensor[s], representing the final state for RNN. It has the same + structure of intial state. Each tensor in final states has the same + shape and dtype as the corresponding tensor in initial states. Examples: .. code-block:: python - - import paddle.fluid as fluid - inputs = fluid.data(name="inputs", - shape=[-1, 32, 128], - dtype="float32") - cell = fluid.layers.GRUCell(hidden_size=128) - outputs = fluid.layers.rnn(cell=cell, inputs=inputs) + import paddle + paddle.disable_static() + + cell = paddle.nn.SimpleRNNCell(16, 32) + + inputs = paddle.rand((4, 23, 16)) + prev_h = paddle.randn((4, 32)) + outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h) + """ - if in_dygraph_mode: + if in_dygraph_mode(): return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length, time_major, is_reverse, **kwargs) else: @@ -529,7 +530,6 @@ def _rnn_dynamic_graph(cell, time_major=False, is_reverse=False, **kwargs): - time_step_index = 0 if time_major else 1 flat_inputs = flatten(inputs) time_steps = flat_inputs[0].shape[time_step_index] @@ -589,16 +589,6 @@ def _rnn_static_graph(cell, ['float32', 'float64'], 'rnn') check_type(initial_states, 'initial_states', (Variable, list, tuple, type(None)), 'rnn') - if isinstance(initial_states, (list, tuple)): - states = map_structure(lambda x: x, initial_states)[0] - for i, state in enumerate(states): - if isinstance(state, (list, tuple)): - for j, state_j in enumerate(state): - check_variable_and_dtype(state_j, 'state_j[' + str(j) + ']', - ['float32', 'float64'], 'rnn') - else: - check_variable_and_dtype(state, 'states[' + str(i) + ']', - ['float32', 'float64'], 'rnn') check_type(sequence_length, 'sequence_length', (Variable, type(None)), 'rnn') @@ -661,6 +651,83 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) +def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, + time_major): + """ + birnn creates a bidirectional recurrent neural network specified by + RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` + (for dygraph mode :code:`cell.forward`) repeatedly until reaches to + the maximum length of `inputs` and then concat the ouputs for both RNNs + along the last axis. + + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): A (possibly nested structure of) tensor[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states(tuple, optional): A tuple of + If not provided, `cell.get_initial_states` would be used to produce + the each initial state. Defaults to None. + sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. 
Default None. + time_major(bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, time_steps, ...]`. If + `True`, the data layout would be time major with shape + `[time_steps, batch_size, ...]`. Default: `False`. + **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + + Returns: + outputs (Tensor): A (possibly nested structure of) tensor variable[s], + the outputs of the bidirectional RNN. It is the concatenation + of the outputs for both the forward RNN and backward RNN along + the last axis. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. + + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + cell_fw = LSTMCell(16, 32) + cell_bw = LSTMCell(16, 32) + inputs = paddle.rand((2, 23, 16)) + outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs) + + """ + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw, + outputs_bw) + + final_states = (states_fw, states_bw) + return outputs, final_states + + class Decoder(object): """ :api_attr: Static Graph diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index bc71b8bdf06d2..81ab6c62f0915 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -156,9 +156,8 @@ from .pooling import pool3d #DEFINE_ALIAS from .pooling import adaptive_pool2d #DEFINE_ALIAS from .pooling import adaptive_pool3d #DEFINE_ALIAS -# from .rnn import gru_unit #DEFINE_ALIAS -# from .rnn import lstm #DEFINE_ALIAS -# from .rnn import lstm_unit #DEFINE_ALIAS +from .rnn import rnn #DEFINE_ALIAS +from .rnn import birnn #DEFINE_ALIAS from .vision import affine_channel #DEFINE_ALIAS from .vision import affine_grid #DEFINE_ALIAS from .vision import anchor_generator #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/nn/functional/rnn.py index 520cf44360dc3..b7a97bc5aa303 100644 --- a/python/paddle/nn/functional/rnn.py +++ b/python/paddle/nn/functional/rnn.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define function of recurrent neural network +from paddle.fluid.layers.rnn import rnn, birnn -__all__ = [ - # 'gru_unit', - # 'lstm', - # 'lstm_unit' -] +__all__ = ['rnn', 'birnn'] diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index e93a34f59a09c..d2bdd4f80c42a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -21,16 +21,14 @@ import warnings from functools import partial, reduce -from ... 
import fluid -from ...fluid import layers -from ...fluid import initializer as I -from ...fluid.data_feeder import convert_dtype -from ...fluid.dygraph import Layer, LayerList -from ...fluid.param_attr import ParamAttr -from ...fluid.layers import utils, BeamSearchDecoder -from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as - -# TODO: define classes of recurrent neural network +import paddle +from paddle import framework +from paddle.nn import functional as F +from paddle.nn import initializer as I +from paddle.fluid.dygraph import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.data_feeder import convert_dtype __all__ = [ 'RNNCellBase', @@ -42,21 +40,55 @@ 'SimpleRNN', 'LSTM', 'GRU', - # 'StackedRNNCell', - # 'StackedLSTMCell', - # 'stackedGRUCell', ] def split_states(states, bidirectional=False, state_components=1): + """ + Split states of RNN network into possibly nested list or tuple of + states of each RNN cells of the RNN network. + + Arguments: + states (Tensor|tuple|list): the concatenated states for RNN network. + When ``state_components`` is 1, states in a Tensor with shape + ``(L\*D, N, C)`` where ``L`` is the number of layers of the RNN + network, ``D`` is the number of directions of the RNN network(1 + for unidirectional RNNs and 2 for bidirectional RNNs), ``N`` is + the batch size of the input to the RNN network, ``C`` is the + hidden size of the RNN network. + + When `state_components` is larger than 1, ``states`` is a tuple of + ``state_components`` Tensors that meet the requirements described + above. + + For SimpleRNNs and GRUs, ``state_components`` is 1, and for LSTMs, + ``state_components`` is 2. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + ``states`` above. Defaults to 1. + + Returns: + A nested list or tuple of RNN cell states. + If ``bidirectional`` is True, it can be indexed twice to get an RNN + cell state. The first index indicates the layer, the second index + indicates the direction. + If ``bidirectional`` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if ``state_components`` is larger than 1, an RNN cell state + can be indexed one more time to get a tensor of shape(N, C), where + ``N`` is the batch size of the input to the RNN cell, and ``C`` is the + hidden size of the RNN cell. + """ if state_components == 1: - states = layers.unstack(states) + states = paddle.unstack(states) if not bidirectional: return states else: return list(zip(states[::2], states[1::2])) else: - states = tuple([layers.unstack(item) for item in states]) + assert len(states) == state_components + states = tuple([paddle.unstack(item) for item in states]) if not bidirectional: return list(zip(*states)) else: @@ -65,36 +97,45 @@ def split_states(states, bidirectional=False, state_components=1): def concat_states(states, bidirectional=False, state_components=1): + """ + Concatenate a possibly nested list or tuple of RNN cell states into a + compact form. + + Arguments: + states (list|tuple): a possibly nested list or tuple of RNN cell + states. + If ``bidirectional`` is True, it can be indexed twice to get an + RNN cell state. The first index indicates the layer, the second + index indicates the direction. 
+ If ``bidirectional`` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if ``state_components`` is larger than 1, an RNN cell + state can be indexed one more time to get a tensor of shape(N, C), + where ``N`` is the batch size of the input to the RNN cell, and + ``C`` is the hidden size of the RNN cell. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + ``states`` above. Defaults to 1. + + Returns: + Concatenated states for RNN network. + When ``state_components`` is 1, states in a Tensor with shape + ``(L\*D, N, C)`` where ``L`` is the number of layers of the RNN + network, ``D`` is the number of directions of the RNN network(1 for + unidirectional RNNs and 2 for bidirectional RNNs), ``N`` is the batch + size of the input to the RNN network, ``C`` is the hidden size of the + RNN network. + + """ if state_components == 1: - return layers.stack(flatten(states)) + return paddle.stack(flatten(states)) else: states = flatten(states) componnets = [] for i in range(state_components): componnets.append(states[i::state_components]) - return [layers.stack(item) for item in componnets] - - -def birnn(cell_fw, cell_bw, inputs, states_fw, states_bw, sequence_length, - time_major): - outputs_fw, states_fw = layers.rnn(cell_fw, - inputs, - states_fw, - sequence_length, - time_major=time_major) - - outputs_bw, states_bw = layers.rnn(cell_bw, - inputs, - states_bw, - sequence_length, - time_major=time_major, - is_reverse=True) - - outputs = map_structure(lambda x, y: layers.concat([x, y], -1), outputs_fw, - outputs_bw) - - final_states = (states_fw, states_bw) - return outputs, final_states + return [paddle.stack(item) for item in componnets] class RNNCellBase(Layer): @@ -113,7 +154,7 @@ def get_initial_states(self, """ Generate initialized states according to provided shape, data type and value. - Parameters: + Arguments: batch_ref: A (possibly nested structure of) tensor variable[s]. The first dimension of the tensor will be used as batch size to initialize states. @@ -169,13 +210,13 @@ def __init__(self, shape): try: states_dtypes = self.state_dtype if dtype is None else dtype except NotImplementedError: # use fp32 as default - states_dtypes = "float32" + states_dtypes = framework.get_default_dtype() if len(flatten(states_dtypes)) == 1: dtype = flatten(states_dtypes)[0] states_dtypes = map_structure(lambda shape: dtype, states_shapes) init_states = map_structure( - lambda shape, dtype: layers.fill_constant_batch_size_like( + lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like( input=batch_ref, shape=shape.shape, dtype=dtype, @@ -215,20 +256,77 @@ def state_dtype(self): class SimpleRNNCell(RNNCellBase): - def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): + r""" + Elman RNN (SimpleRNN) cell. + + The formula used is as follows: + + .. math:: + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Please refer to `Finding Structure in Time + `_ for more details. + + Arguments: + input_size (int): The input size. + hidden_size (int): The hidden size. + nonlinearity (str): The activation in the SimpleRNN cell. It can be + ``tanh`` or ``relu``. Defaults to ``tanh``. + weight_ih_attr(ParamAttr, optional): The parameter attribute for + ``weight_ih``. 
Default: None. + weight_hh_attr(ParamAttr, optional): The parameter attribute for + ``weight_hh``. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih``. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh``. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + import paddle + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + y, h = cell(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + nonlinearity="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): super(SimpleRNNCell, self).__init__() std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - (hidden_size, input_size), default_initializer=I.Uniform(-std, std)) + (hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( (hidden_size, hidden_size), + weight_hh_attr, default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( (hidden_size, ), + bias_ih_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( (hidden_size, ), + bias_hh_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) @@ -239,18 +337,36 @@ def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): "nonlinearity for SimpleRNNCell should be tanh or relu, " "but get {}".format(nonlinearity)) self.nonlinearity = nonlinearity - self._nonlinear_fn = layers.tanh \ + self._nonlinear_fn = paddle.tanh \ if nonlinearity == "tanh" \ - else layers.relu + else F.relu def forward(self, inputs, states=None): + """ + Given the input and previous atate, compute the output and update state. + + Arguments: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (Tensor, optional): shape `[batch_size, hidden_size]`, the + previous hidden state, corresponding to :math:`h_{t-1}` in the + formula. When states is None, zero state is used. Defaults to + None. + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_h = states - i2h = layers.matmul(inputs, self.weight_ih, transpose_y=True) + i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: i2h += self.bias_ih - h2h = layers.matmul(pre_h, self.weight_hh, transpose_y=True) + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h2h += self.bias_hh h = self._nonlinear_fn(i2h + h2h) @@ -262,91 +378,119 @@ def state_shape(self): class LSTMCell(RNNCellBase): - """ + r""" Long-Short Term Memory(LSTM) RNN cell. The formula used is as follows: .. 
math::
-        i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}})
-        f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}})
-        o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}})
-        c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}})
-        h_{t} & = o_{t} \\tanh (c_{t})
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
+        \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
+        c_{t} & = f_{t} \* c_{t-1} + i_{t} \* \\widetilde{c}_{t}
+        h_{t} & = o_{t} \* \\tanh(c_{t})
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    `_ for more details.

-    Parameters:
-        input_size (int): The input size in the LSTM cell.
-        hidden_size (int): The hidden size in the LSTM cell.
-        param_attr(ParamAttr, optional): The parameter attribute for the learnable
-            weight matrix. Default: None.
-        bias_attr (ParamAttr, optional): The parameter attribute for the bias
-            of LSTM. Default: None.
-        dtype(string, optional): The data type used in this cell. Default float32.
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_ih``. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_hh``. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_ih``. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_hh``. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.

    Examples:
        .. code-block:: python
+
            import paddle
-            inputs = paddle.rand((2, 4, 32))
-            cell = paddle.LSTMCell(input_size=32, hidden_size=64)
-            rnn = paddle.RNN(cell=cell)
-            outputs, _ = rnn(inputs)  # [2, 4, 64]
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.LSTMCell(16, 32)
+            y, h = cell(x, prev_h)
+
    """

-    def __init__(self, input_size, hidden_size, name=None):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
        super(LSTMCell, self).__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_ih = self.create_parameter(
            (4 * hidden_size, input_size),
+            weight_ih_attr,
            default_initializer=I.Uniform(-std, std))
        self.weight_hh = self.create_parameter(
            (4 * hidden_size, hidden_size),
+            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = self.create_parameter(
            (4 * hidden_size, ),
+            bias_ih_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.bias_hh = self.create_parameter(
            (4 * hidden_size, ),
+            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
-        self._gate_activation = layers.sigmoid
-        self._activation = layers.tanh
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        """
-        Performs single step LSTM calculations.
- Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tuple of two tensors, each shaped - `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` - in the formula. The data type should be float32 or float64. + Given the input and previous atate, compute the output and update state. + + Arguments: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (tuple, optional): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the previous hidden state, + corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. + When states is None, zero state is used. Defaults to None. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula; `new_states` is a list containing \ - two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}, c_{t}` in the formula. The data type of these \ - tensors all is same as that of `states`. + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (tuple): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the new hidden states, + corresponding to :math:`h_{t}, c{t}` in the formula. + """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states - gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: gates = gates + self.bias_ih - gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) if self.bias_hh is not None: gates = gates + self.bias_hh - chunked_gates = layers.split(gates, num_or_sections=4, dim=-1) + chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1) i = self._gate_activation(chunked_gates[0]) f = self._gate_activation(chunked_gates[1]) @@ -359,74 +503,95 @@ def forward(self, inputs, states=None): @property def state_shape(self): """ - The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` - (-1 for batch size would be automatically inserted into shape). These two - shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + The `state_shape` of LSTMCell is a tuple with two shapes: + `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be + automatically inserted into shape). These two shapes correspond + to :math:`h_{t-1}` and :math:`c_{t-1}` separately. """ return ((self.hidden_size, ), (self.hidden_size, )) class GRUCell(RNNCellBase): - """ + r""" Gated Recurrent Unit (GRU) RNN cell. The formula for GRU used is as follows: .. 
math::
-        u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}})
-
-        r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}})
-
-        \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}})
-
-        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
+        \\widetilde{h}_{t} & = \\tanh(W_{ic}x_{t} + b_{ic} + r_{t} \* (W_{hc}h_{t-1} + b_{hc}))
+        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t}
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    `_ for more details.

    Parameters:
-        input_size (int): The input size for the first GRU cell.
-        hidden_size (int): The hidden size for every GRU cell.
-        param_attr(ParamAttr, optional): The parameter attribute for the learnable
-            weight matrix. Default: None.
-        bias_attr (ParamAttr, optional): The parameter attribute for the bias
-            of LSTM. Default: None.
-        dtype(string, optional): The data type used in this cell. Default float32.
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_ih``. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_hh``. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_ih``. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_hh``. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.

    Examples:
-
        .. 
code-block:: python import paddle - inputs = paddle.rand((2, 4, 32)) - cell = BasicGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.GRUCell(16, 32) + y, h = cell(x, prev_h) + """ - def __init__(self, input_size, hidden_size, name=None): + def __init__(self, + input_size, + hidden_size, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): super(GRUCell, self).__init__() std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (3 * hidden_size, input_size), + weight_ih_attr, default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( (3 * hidden_size, hidden_size), + weight_hh_attr, default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( (3 * hidden_size, ), + bias_ih_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( (3 * hidden_size, ), + bias_hh_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.hidden_size = hidden_size self.input_size = input_size - self._gate_activation = layers.sigmoid - self._activation = layers.tanh + self._gate_activation = F.sigmoid + self._activation = paddle.tanh def forward(self, inputs, states=None): """ @@ -450,15 +615,15 @@ def forward(self, inputs, states=None): states = self.get_initial_states(inputs, self.state_shape) pre_hidden = states - x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: x_gates = x_gates + self.bias_ih - h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h_gates = h_gates + self.bias_hh - x_r, x_z, x_c = layers.split(x_gates, num_or_sections=3, dim=1) - h_r, h_z, h_c = layers.split(h_gates, num_or_sections=3, dim=1) + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) r = self._gate_activation(x_r + h_r) z = self._gate_activation(x_z + h_z) @@ -470,7 +635,7 @@ def forward(self, inputs, states=None): @property def state_shape(self): """ - The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch size would be automatically inserted into shape). The shape corresponds to :math:`h_{t-1}`. """ @@ -479,27 +644,62 @@ def state_shape(self): class RNN(Layer): """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Wrapper for RNN, which creates a recurrent neural network specified with a + RNN cell. It performs :code:`cell.forward()` repeatedly until reaches to + the maximum length of `inputs`. - Parameters: - cell(RNNCell): An instance of `RNNCell`. + Arguments: + cell(RNNCellBase): An instance of `RNNCell`. is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. + order of input sequences. Defaults to False. time_major (bool, optional): Indicate the data layout of Tensor included in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. 
If + be batch major with shape `[batch_size, time_steps, ...]`. If `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + `[time_steps, batch_size, ...]`. Defaults to False. + + Inputs: + inputs (Tensor): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Tensor|list|tuple, optional): A (possibly nested structure of) + tensor[s], representing the initial state for the rnn cell. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Outputs: + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequence. Tensor or nested + structure of Tensor. + If `time_major` is True, the shape of each tensor in outpus is + `[time_steps, batch_size, hidden_size]`, else + `[batch_size, time_steps, hidden_size]`. + final_states (Tensor|list|tuple): final states. A (possibly nested structure of) + tensor[s], representing the final state for RNN. It has the same + structure of intial state. Each tensor in final states has the same + shape and dtype as the corresponding tensor in initial states. Examples: .. code-block:: python + import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + paddle.disable_static() + + inputs = paddle.rand((4, 23, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + rnn = paddle.RNN(cell) + outputs, final_states = rnn(inputs, prev_h) + """ def __init__(self, cell, is_reverse=False, time_major=False): @@ -510,81 +710,85 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.cell.call = self.cell.forward self.is_reverse = is_reverse self.time_major = time_major - # self.batch_index, self.time_step_index = (1, 0) \ - # if time_major else (0, 1) def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. 
- **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. - """ - if initial_states is None: initial_states = self.cell.get_initial_states( batch_ref=inputs, dtype=inputs.dtype, batch_dim_idx=self.batch_index) - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse) + final_outputs, final_states = F.rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) return final_outputs, final_states class BiRNN(Layer): """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. + Wrapper for bidirectional RNN. It assembles two RNN cells by performing + forward and backward RNN separately, and concat outputs. + Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. + cell_fw (RNNCellBase): A RNNCell instance used for forward RNN. + cell_bw (RNNCellBase): A RNNCell instance used for backward RNN. + time_major (bool): Whether the first dimension of the input means the + time steps. + + Inputs: + inputs (Tensor): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (list|tuple, optional): A tuple of the initial states of + the forward cell and backward cell. + If not provided, `cell.get_initial_states` would be used to produce + the initial states. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Outputs: + outputs (Tensor): A (possibly nested structure of) tensor variable[s], + the outputs of the bidirectional RNN. It is the concatenation + of the outputs for both the forward RNN and backward RNN along + the last axis. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. Examples: .. 
code-block:: python + import paddle - from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN - inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] + paddle.disable_static() + + cell_fw = LSTMCell(16, 32) + cell_bw = LSTMCell(16, 32) + rnn = BidirectionalRNN(cell_fw, cell_bw) + + inputs = paddle.rand((2, 23, 16)) + outputs, final_states = rnn(inputs) + """ def __init__(self, cell_fw, cell_bw, time_major=False): super(BiRNN, self).__init__() self.cell_fw = cell_fw self.cell_bw = cell_bw + for cell in [self.cell_fw, self.cell_bw]: + if not hasattr(cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + cell.call = cell.forward self.time_major = time_major def forward(self, @@ -592,131 +796,47 @@ def forward(self, initial_states=None, sequence_length=None, **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`; similarly, `final_states` is produced by merge \ - `final_states` of forward and backward RNN. - """ if isinstance(initial_states, (list, tuple)): assert len(initial_states) == 2, \ "length of initial_states should be 2 when it is a list/tuple" else: initial_states = [initial_states, initial_states] - states_fw, states_bw = initial_states - outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, - states_fw, states_bw, sequence_length, - self.time_major) + outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) return outputs, final_states - @staticmethod - def bidirect_param_attr(param_attr): - """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. - - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. - - Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. 
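
As an aside for readers of the BiRNN wrapper above: the sketch below is a plain NumPy illustration (the `step` helper and all sizes are made up for the example; it is not how `F.birnn` is implemented) of the merge behaviour described in the docstring, i.e. one cell is run left-to-right, a second cell right-to-left over the same batch, and the two output sequences are concatenated along the last axis, giving outputs of width 2 * hidden_size.

    import numpy as np

    batch, time_steps, in_size, hidden = 4, 5, 16, 32
    rng = np.random.RandomState(0)

    def step(x, h, w_ih, w_hh):
        # a single tanh recurrence, standing in for any RNN cell's forward
        return np.tanh(x @ w_ih.T + h @ w_hh.T)

    def run(xs, w_ih, w_hh, reverse=False):
        h = np.zeros((batch, hidden))
        outs = [None] * time_steps
        order = reversed(range(time_steps)) if reverse else range(time_steps)
        for t in order:
            h = step(xs[:, t], h, w_ih, w_hh)
            outs[t] = h
        return np.stack(outs, axis=1)               # [batch, time_steps, hidden]

    xs = rng.randn(batch, time_steps, in_size)
    out_fw = run(xs, rng.randn(hidden, in_size), rng.randn(hidden, hidden))
    out_bw = run(xs, rng.randn(hidden, in_size), rng.randn(hidden, hidden),
                 reverse=True)                      # backward cell reads time reversed
    merged = np.concatenate([out_fw, out_bw], axis=-1)
    print(merged.shape)                             # (4, 5, 64): last axis is 2 * hidden
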
- """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs - - -class SimpleRNN(LayerList): - def __init__(self, - input_size, - hidden_size, - num_layers=1, - nonlinearity="tanh", - direction="forward", - dropout=0., - time_major=False, - name=None): - super(SimpleRNN, self).__init__() - - if direction in ["forward", "backward"]: - is_reverse = direction == "backward" - cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) - self.append(RNN(cell, is_reverse, time_major)) - for i in range(1, num_layers): - cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) - self.append(RNN(cell, is_reverse, time_major)) - elif direction == "bidirectional": - cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) - cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - for i in range(1, num_layers): - cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, - nonlinearity) - cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, - nonlinearity) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - else: - raise ValueError( - "direction should be forward, backward or bidirectional, " - "received direction = {}".format(direction)) - self.dropout = dropout - self.num_directions = 2 if direction == "bidirectional" else 1 - self.time_major = time_major - self.num_layers = num_layers +class RNNMixin(LayerList): + r""" + A Mixin class for RNN networks. It provides forward method for SimpleRNN, + LSTM and GRU. + """ def forward(self, inputs, initial_states=None, sequence_length=None): batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] + dtype = inputs.dtype if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, + state_shape = (self.num_layers * self.num_directions, -1, self.hidden_size) - initial_states = layers.zeros(state_shape, dtype=inputs.dtype) - - states = split_states(initial_states, self.num_directions == 2) + if self.state_components == 1: + initial_states = paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + else: + initial_states = tuple([ + paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) final_states = [] + for i, rnn_layer in enumerate(self): if i > 0: - inputs = layers.dropout( + inputs = F.dropout( inputs, self.dropout, dropout_implementation="upscale_in_train") @@ -724,74 +844,225 @@ def forward(self, inputs, initial_states=None, sequence_length=None): final_states.append(final_state) inputs = outputs - final_states = concat_states(final_states, self.num_directions == 2) + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) return outputs, final_states -class LSTM(LayerList): +class SimpleRNN(RNNMixin): + r""" + Multilayer Elman network(SimpleRNN). 
It takes a sequence and an initial + state as inputs, and returns the output sequence and the final state. + + Each layer inside the SimpleRNN maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) + and new state(:math:`h_{t}`). + + .. math:: + + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + nonlinearity (str): The activation in each SimpleRNN cell. It can be + ``tanh`` or ``relu``. Defaults to ``tanh``. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + + Examples: + .. 
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.SimpleRNN(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) + + """ + def __init__(self, input_size, hidden_size, num_layers=1, + nonlinearity="tanh", direction="forward", dropout=0., time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, name=None): - super(LSTM, self).__init__() + super(SimpleRNN, self).__init__() if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = LSTMCell(input_size, hidden_size) + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, bias_ih_attr, + bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = LSTMCell(hidden_size, hidden_size) + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = LSTMCell(input_size, hidden_size) - cell_bw = LSTMCell(input_size, hidden_size) + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): - cell_fw = LSTMCell(2 * hidden_size, hidden_size) - cell_bw = LSTMCell(2 * hidden_size, hidden_size) + cell_fw = SimpleRNNCell( + 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell( + 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: raise ValueError( "direction should be forward, backward or bidirectional, " "received direction = {}".format(direction)) + self.input_size = input_size + self.hidden_size = hidden_size self.dropout = dropout self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major self.num_layers = num_layers + self.state_components = 1 - def forward(self, inputs, initial_states=None, sequence_length=None): - batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] - if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, - self.hidden_size) - init_h = layers.zeros(state_shape, dtype=inputs.dtype) - init_c = layers.zeros(state_shape, dtype=inputs.dtype) - initial_states = (init_h, init_c) - states = split_states(initial_states, self.num_directions == 2, 2) - final_states = [] - for i, rnn_layer in enumerate(self): - if i > 0: - inputs = layers.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") - outputs, final_state = rnn_layer(inputs, states[i], sequence_length) - final_states.append(final_state) - inputs = outputs +class LSTM(RNNMixin): + r""" + Multilayer LSTM. It takes a sequence and an initial state as inputs, and + returns the output sequence and the final state. 
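
Recapping the SimpleRNN layer defined above before the LSTM details: the following NumPy sketch (illustrative only; weight names follow the weight_ih/weight_hh convention used in this patch) unrolls the Elman recurrence h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh) over a batch-major input and confirms the documented output and state shapes for a single forward-direction layer.

    import numpy as np

    batch, time_steps, input_size, hidden_size = 4, 23, 16, 32
    rng = np.random.RandomState(0)

    weight_ih = rng.uniform(-0.1, 0.1, (hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (hidden_size, hidden_size))
    bias_ih = np.zeros(hidden_size)
    bias_hh = np.zeros(hidden_size)

    x = rng.randn(batch, time_steps, input_size)   # batch-major, time_major=False
    h = np.zeros((batch, hidden_size))             # zero initial state

    outputs = []
    for t in range(time_steps):
        # h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh)
        h = np.tanh(x[:, t] @ weight_ih.T + bias_ih + h @ weight_hh.T + bias_hh)
        outputs.append(h)

    outputs = np.stack(outputs, axis=1)
    print(outputs.shape)   # (4, 23, 32): [batch_size, time_steps, hidden_size]
    print(h.shape)         # (4, 32): final state of this single layer/direction
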
- final_states = concat_states(final_states, self.num_directions == 2, 2) - return outputs, final_states + Each layer inside the LSTM maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + output(:math:`y_{t}`) and new state(:math:`h_{t}, c_{t}`). + .. math:: + + i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) + \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} + h_{t} & = o_{t} \* \\tanh(c_{t}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (tuple, optional): the initial state, a tuple of (h, c), + the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): the final state, a tuple of (h, c), + the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + + Examples: + .. 
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.LSTM(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + prev_c = paddle.randn((2, 4, 32)) + y, (h, c) = rnn(x, (prev_h, prev_c)) + + """ -class GRU(LayerList): def __init__(self, input_size, hidden_size, @@ -799,522 +1070,170 @@ def __init__(self, direction="forward", dropout=0., time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, name=None): - super(GRU, self).__init__() + super(LSTM, self).__init__() if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = GRUCell(input_size, hidden_size) + cell = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = GRUCell(hidden_size, hidden_size) + cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = GRUCell(input_size, hidden_size) - cell_bw = GRUCell(input_size, hidden_size) + cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): - cell_fw = GRUCell(2 * hidden_size, hidden_size) - cell_bw = GRUCell(2 * hidden_size, hidden_size) + cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: raise ValueError( "direction should be forward, backward or bidirectional, " "received direction = {}".format(direction)) + self.input_size = input_size + self.hidden_size = hidden_size self.dropout = dropout self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major self.num_layers = num_layers + self.state_components = 2 - def forward(self, inputs, initial_states=None, sequence_length=None): - batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] - if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, - self.hidden_size) - initial_states = layers.zeros(state_shape, dtype=inputs.dtype) - states = split_states(initial_states, self.num_directions == 2) - - final_states = [] - for i, rnn_layer in enumerate(self): - if i > 0: - inputs = layers.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") - outputs, final_state = rnn_layer(inputs, states[i], sequence_length) - final_states.append(final_state) - inputs = outputs - final_states = concat_states(final_states, self.num_directions == 2) - return outputs, final_states +class GRU(RNNMixin): + r""" + Multilayer GRU. It takes a sequence and an initial state as inputs, and + returns the output sequence and the final state. 
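
Recapping the LSTM layer defined above before the GRU details: this single-step NumPy sketch (an illustration using the packed 4 * hidden_size weight layout also used by the reference LSTMCell in rnn_numpy.py later in this patch; it is not the operator implementation) evaluates the gate equations from the docstring.

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    batch, input_size, hidden_size = 4, 16, 32
    rng = np.random.RandomState(0)

    # gates i, f, g (cell candidate) and o are packed along the first axis
    weight_ih = rng.uniform(-0.1, 0.1, (4 * hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (4 * hidden_size, hidden_size))
    bias = np.zeros(4 * hidden_size)

    x = rng.randn(batch, input_size)
    h_prev = np.zeros((batch, hidden_size))
    c_prev = np.zeros((batch, hidden_size))

    gates = x @ weight_ih.T + h_prev @ weight_hh.T + bias
    i, f, g, o = np.split(gates, 4, axis=-1)
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
    c = f * c_prev + i * np.tanh(g)     # new cell state
    h = o * np.tanh(c)                  # new hidden state, also the step output
    print(h.shape, c.shape)             # (4, 32) (4, 32)
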
+ Each layer inside the GRU maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) + and new state(:math:`h_{t}`). -# class LSTM(Layer): -# """ -# Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input -# sequence. - -# The formula for LSTM used here is as follows: - -# .. math:: -# i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) -# f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) -# o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) -# c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) -# h_{t} & = o_{t} \\tanh (c_{t}) - -# Parameters: -# input_size (int): The input feature size for the first LSTM. -# hidden_size (int): The hidden size for every LSTM. -# num_layers(int, optional): The number of LSTM to be stacked. Default 1. -# dropout(float, optional): The dropout probability applied on the outputs -# of each LSTM except the last one. 0 for not dropout. Default 0.0 -# direction (str, optional): Indicate the direction for LSTM calculation -# applying on the input sequences. It can be `forward`, `backward` or -# `bidirect`. If it is `backward`, calculate in the reverse order of -# input sequences. If it is `bidirect`, each layer would be a -# bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, -# and it concatenates their outputs as outputs. Default: `forward`. -# time_major (bool, optional): Indicate the data layout of Tensor included -# in `input` and `output` tensors. If `False`, the data layout would -# be batch major with shape `[batch_size, sequence_length, ...]`. If -# `True`, the data layout would be time major with shape -# `[sequence_length, batch_size, ...]`. Default: `False`. -# param_attr (list|tuple|ParamAttr): A list, tuple or something can be -# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is -# a list or tuple, it's length must equal to `num_layers`. Otherwise, -# construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. -# Default None. -# bias_attr (list|tuple|ParamAttr): A list, tuple or something can be -# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is -# a list or tuple, it's length must equal to `num_layers`. Otherwise, -# construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. -# Default None. -# dtype(string, optional): The data type used in this cell. It can be -# float32 or float64. Default float32. -# Examples: -# .. 
code-block:: python -# import paddle -# import paddle.fluid as fluid -# from paddle.incubate.hapi.text import LSTM -# inputs = paddle.rand((2, 4, 32)) -# lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) -# outputs, _ = lstm(inputs) # [2, 4, 64] -# """ - -# def __init__(self, -# input_size, -# hidden_size, -# num_layers=1, -# direction="forward", -# dropout=0.0, -# time_major=False, -# name=None): -# super(LSTM, self).__init__() -# self.input_size = input_size -# self.hidden_size = hidden_size -# self.num_layers = num_layers -# self.dropout = dropout -# self.direction = direction -# self.num_directions = 2 if direction == 'bidirect' else 1 -# self.time_major = time_major - -# if direction == 'bidirect': -# param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) -# bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) -# fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], -# num_layers) -# bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], -# num_layers) -# fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], -# num_layers) -# bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], -# num_layers) - -# # maybe design cell including both forward and backward later -# merge_mode = 'concat' -# rnns = [] -# for i in range(num_layers): -# cell_fw = StackedLSTMCell(input_size if i == 0 else ( -# hidden_size * 2 if merge_mode == 'concat' else -# hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], -# fw_bias_attrs[i], dtype) -# cell_bw = StackedLSTMCell(input_size if i == 0 else ( -# hidden_size * 2 if merge_mode == 'concat' else -# hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], -# bw_bias_attrs[i], dtype) -# rnns.append( -# BidirectionalRNN( -# cell_fw, -# cell_bw, -# merge_mode=merge_mode, -# time_major=time_major)) -# self.lstm = LayerList(rnns) -# else: -# lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, -# dropout, param_attr, bias_attr, dtype) -# self.lstm = RNN(lstm_cell, -# is_reverse=(direction == "backward"), -# time_major=time_major) - -# def forward(self, input, initial_states=None, sequence_length=None): -# """ -# Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` -# is the `inputs` of the subsequent one. -# Parameters: -# inputs (Variable): The inputs for the first LSTM. It is a float32 -# or float64 tensor shaped `[batch_size, sequence_length, input_size]`. -# initial_states (list|None, optional): A list containing initial states -# of all stacked LSTM, and the initial states of each LSTM is a pair -# of tensors shaped `[batch_size, hidden_size]`. If not provided, -# use 0 as initial states. Default None. -# sequence_length (Variable, optional): A tensor with shape `[batch_size]`. -# It stores real length of each instance, thus enables users to extract -# the last valid state when past a batch element's sequence length for -# correctness. If not provided, the paddings would be treated same as -# non-padding inputs. Default None. -# Returns: -# tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ -# is the output of last LSTM and it is a tensor with shape \ -# `[batch_size, sequence_length, hidden_size]` and has the same \ -# data type as `inputs`, `final_states` is the counterpart of \ -# `initial_states` at last time step, thus has the same structure \ -# with it and has tensors with same shapes data types. 
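
To tie the GRU update equations in the docstring above to code, here is a one-step NumPy sketch (illustrative only; it follows the packed 3 * hidden_size weight layout and the reset-after-matmul convention of the reference GRUCell in rnn_numpy.py later in this patch).

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    batch, input_size, hidden_size = 4, 16, 32
    rng = np.random.RandomState(0)

    # r (reset), z (update) and c (candidate) blocks packed along the first axis
    weight_ih = rng.uniform(-0.1, 0.1, (3 * hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (3 * hidden_size, hidden_size))

    x = rng.randn(batch, input_size)
    h_prev = np.zeros((batch, hidden_size))

    x_r, x_z, x_c = np.split(x @ weight_ih.T, 3, axis=-1)
    h_r, h_z, h_c = np.split(h_prev @ weight_hh.T, 3, axis=-1)

    r = sigmoid(x_r + h_r)              # reset gate
    z = sigmoid(x_z + h_z)              # update gate
    c = np.tanh(x_c + r * h_c)          # candidate, reset applied after the matmul
    h = z * h_prev + (1.0 - z) * c      # new state, also the step output
    print(h.shape)                      # (4, 32)
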
-# """ -# if not isinstance(self.lstm, LayerList): -# return self.lstm(input, initial_states, sequence_length) -# else: -# if isinstance(initial_states, (list, tuple)): -# assert len(initial_states) == self.num_layers, ( -# "length of initial_states should be %d when it is a list|tuple" -# % self.num_layers) -# else: -# initial_states = [initial_states] * self.num_layers -# stacked_states = [] -# for i in range(self.num_layers): -# output, states = self.lstm[i](input, initial_states[i], -# sequence_length) -# input = output -# stacked_states.append(states) -# return output, stacked_states - - -# TODO: restucture RNN layers -class StackedRNNCell(RNNCellBase): - """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. + .. math:: - Parameters: - cells (list|tuple): List of RNN cell instances. + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr}) + z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) + \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc})) + h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: .. 
code-block:: python - from paddle import LSTMCell, StackedRNNCell - cells = [LSTMCell(32, 32), LSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) - """ - - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = LayerList(cells) - - def forward(self, inputs, states): - """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. - Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. - """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state) - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - Returns: - list: A list composed of each including cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs - - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedLSTMCell(RNNCellBase): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. - - The formula for LSTM used here is as follows: + import paddle + paddle.disable_static() - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + rnn = paddle.nn.GRU(16, 32, 2) - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM cell except the last one. 0 for no dropout. 
Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) - Examples: - .. code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, input_size, hidden_size, num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.hidden_size = hidden_size - self.input_size = input_size - self.num_layers = num_layers - self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - LSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - # TODO(guosheng): maybe should stack list of states as one tensor - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. - Returns: - list: A list composed of each including LSTM cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedGRUCell(RNNCellBase): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. - - The formula for GRU used here is as follows: - - .. 
math:: - - u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) - - r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) - - \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each GRU cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle + direction="forward", + dropout=0., + time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(GRU, self).__init__() - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) - def __init__(self, - input_size, - hidden_size, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.hidden_size = hidden_size self.input_size = input_size - self.num_layers = num_layers + self.hidden_size = hidden_size self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - 
GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 From 156b490b7ccf3dfbbd722a1dce06464a76e872be Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 24 Aug 2020 17:37:41 +0800 Subject: [PATCH 05/14] add unittets --- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/rnn/CMakeLists.txt | 6 + .../fluid/tests/unittests/rnn/__init__.py | 13 + .../fluid/tests/unittests/rnn/convert.py | 51 ++ .../fluid/tests/unittests/rnn/rnn_numpy.py | 515 ++++++++++++++++++ .../tests/unittests/rnn/test_rnn_cells.py | 164 ++++++ .../unittests/rnn/test_rnn_cells_static.py | 327 +++++++++++ .../tests/unittests/rnn/test_rnn_nets.py | 269 +++++++++ .../unittests/rnn/test_rnn_nets_static.py | 468 ++++++++++++++++ python/paddle/nn/layer/rnn.py | 4 +- 10 files changed, 1815 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/rnn/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/convert.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 33d9326681d09..e601c5a080172 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -476,6 +476,7 @@ endif() add_subdirectory(sequence) add_subdirectory(dygraph_to_static) +add_subdirectory(rnn) if (WITH_MKLDNN) 
add_subdirectory(mkldnn) diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt new file mode 100644 index 0000000000000..f71e04c09aa38 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/rnn/__init__.py b/python/paddle/fluid/tests/unittests/rnn/__init__.py new file mode 100644 index 0000000000000..abf198b97e6e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py new file mode 100644 index 0000000000000..02f10694a4b47 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/convert.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
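+# The helpers below copy the randomly initialized parameters of the NumPy
+# reference cells/networks defined in rnn_numpy.py into the corresponding
+# paddle.nn layers (keyed by the paddle parameter names in dygraph mode, or
+# written through the global scope in static mode), so that both
+# implementations can be compared on identical weights in the tests.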
+ +import paddle +import numpy as np + + +def convert_params_for_cell(np_cell, paddle_cell): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + v.set_value(state[k]) + + +def convert_params_for_cell_static(np_cell, paddle_cell, place): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + scope = paddle.static.global_scope() + tensor = scope.find_var(v.name).get_tensor() + tensor.set(state[k], place) + + +def convert_params_for_net(np_net, paddle_net): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell(np_layer.cell, paddle_layer.cell) + else: + convert_params_for_cell(np_layer.cell_fw, paddle_layer.cell_fw) + convert_params_for_cell(np_layer.cell_bw, paddle_layer.cell_bw) + + +def convert_params_for_net_static(np_net, paddle_net, place): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell_static(np_layer.cell, paddle_layer.cell, + place) + else: + convert_params_for_cell_static(np_layer.cell_fw, + paddle_layer.cell_fw, place) + convert_params_for_cell_static(np_layer.cell_bw, + paddle_layer.cell_bw, place) diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py new file mode 100644 index 0000000000000..725d7df2df3a5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -0,0 +1,515 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + + +class LayerMixin(object): + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class LayerListMixin(LayerMixin): + def __init__(self, layers=None): + self._layers = list(layers) if layers else [] + + def append(self, layer): + self._layers.append(layer) + + def __iter__(self): + return iter(self._layers) + + +class SimpleRNNCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + if nonlinearity == 'tanh': + self.nonlinearity = np.tanh + else: + self.nonlinearity = lambda x: np.maximum(x, 0.) 
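+        # Weights follow the weight_ih/weight_hh naming used by the paddle
+        # cells and are drawn from U(-k, k) with k = 1 / sqrt(hidden_size);
+        # they are also collected in `self.parameters` so convert.py can copy
+        # them into the corresponding paddle cell for comparison.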
+ + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.bias_hh = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_h = hx + i2h = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = np.matmul(pre_h, self.weight_hh.T) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self.nonlinearity(i2h + h2h) + return h, h + + +class GRUCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 3 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 3 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden = hx + x_gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = np.split(x_gates, 3, 1) + h_r, h_z, h_c = np.split(h_gates, 3, 1) + + r = 1.0 / (1.0 + np.exp(-(x_r + h_r))) + z = 1.0 / (1.0 + np.exp(-(x_z + h_z))) + c = np.tanh(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + return h, h + + +class LSTMCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 4 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 4 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = 
self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + return init_h, init_c + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden, pre_cell = hx + gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = np.split(gates, 4, -1) + + i = 1.0 / (1.0 + np.exp(-chunked_gates[0])) + f = 1.0 / (1.0 + np.exp(-chunked_gates[1])) + o = 1.0 / (1.0 + np.exp(-chunked_gates[3])) + c = f * pre_cell + i * np.tanh(chunked_gates[2]) + h = o * np.tanh(c) + + return h, (h, c) + + +def sequence_mask(lengths, max_len=None): + if max_len is None: + max_len = np.max(lengths) + else: + assert max_len >= np.max(lengths) + return np.arange(max_len) < np.expand_dims(lengths, -1) + + +def update_state(mask, new, old): + if not isinstance(old, (tuple, list)): + return np.where(mask, new, old) + else: + return tuple(map(lambda x, y: np.where(mask, x, y), new, old)) + + +def rnn(cell, + inputs, + initial_states, + sequence_length=None, + time_major=False, + is_reverse=False): + if not time_major: + inputs = np.transpose(inputs, [1, 0, 2]) + if is_reverse: + inputs = np.flip(inputs, 0) + + if sequence_length is None: + mask = None + else: + mask = np.transpose(sequence_mask(sequence_length), [1, 0]) + mask = np.expand_dims(mask, -1) + if is_reverse: + mask = np.flip(mask, 0) + + time_steps = inputs.shape[0] + state = initial_states + outputs = [] + for t in range(time_steps): + x_t = inputs[t] + if mask is not None: + m_t = mask[t] + y, new_state = cell(x_t, state) + y = np.where(m_t, y, 0.) 
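+            # Padded steps (mask False) contribute a zero output, and
+            # update_state below keeps the previous state for them, so the
+            # state returned at the end is the one at each sequence's valid
+            # length.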
+ outputs.append(y) + state = update_state(m_t, new_state, state) + else: + y, new_state = cell(x_t, state) + outputs.append(y) + state = new_state + + outputs = np.stack(outputs) + final_state = state + + if is_reverse: + outputs = np.flip(outputs, 0) + if not time_major: + outputs = np.transpose(outputs, [1, 0, 2]) + return outputs, final_state + + +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False): + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = np.concatenate((outputs_fw, outputs_bw), -1) + final_states = (states_fw, states_bw) + return outputs, final_states + + +def flatten(nested): + return list(_flatten(nested)) + + +def _flatten(nested): + for item in nested: + if isinstance(item, (list, tuple)): + yield from _flatten(item) + else: + yield item + + +def unstack(array, axis=0): + num = array.shape[axis] + sub_arrays = np.split(array, num, axis) + return [np.squeeze(sub_array, axis) for sub_array in sub_arrays] + + +def dropout(array, p=0.5): + if p == 0.0: + return array + + mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) + return array * (mask / (1 - p)) + + +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return np.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [np.stack(item) for item in componnets] + + +class RNN(LayerMixin): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, inputs, initial_states=None, sequence_length=None): + final_outputs, final_states = rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states + + +class BiRNN(LayerMixin): + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) + return outputs, final_states + + +class RNNMixin(LayerListMixin): + def forward(self, inputs, initial_states=None, 
sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, batch_size, + self.hidden_size) + if self.state_components == 1: + initial_states = np.zeros(state_shape, dtype) + else: + initial_states = tuple([ + np.zeros(state_shape, dtype) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class SimpleRNN(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + nonlinearity="tanh", + direction="forward", + dropout=0., + time_major=False): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 + + +class LSTM(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 + + +class GRU(RNNMixin): + def 
__init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size) + cell_bw = GRUCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size) + cell_bw = GRUCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py new file mode 100644 index 0000000000000..78f4bbab3b354 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -0,0 +1,164 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
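The stacked networks defined above pack per-layer (and per-direction) states into a single
array of shape `[num_layers * num_directions, batch_size, hidden_size]` via
`split_states`/`concat_states`. A numpy-only sketch of that packing convention for two
bidirectional layers and a single state component (array sizes here are illustrative, not
taken from the patch):

    import numpy as np

    packed = np.random.randn(4, 3, 8)        # 2 layers * 2 directions, batch 3
    pieces = [np.squeeze(a, 0) for a in np.split(packed, 4, 0)]   # "unstack"
    per_layer = list(zip(pieces[::2], pieces[1::2]))              # (fw, bw) pairs
    assert len(per_layer) == 2 and per_layer[0][0].shape == (3, 8)
    assert np.allclose(np.stack(pieces), packed)                  # concat back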
+ +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell +from convert import convert_params_for_cell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRUCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, (h1, c1) = 
rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for bias in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py new file mode 100644 index 0000000000000..c371e4eff92e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -0,0 +1,327 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +torch.set_default_dtype(torch.float64) + +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from convert import convert_params_for_cell_static +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = 
np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRUCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with 
paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h, c], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for bias in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNNCell, TestLSTMCell, TestGRUCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py new file mode 100644 index 0000000000000..16c790000e862 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
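The length-aware cases in the tests below zero out time steps beyond each sequence's valid
length before comparing against the numpy reference. The same masking in plain numpy,
assuming a batch-major `[batch, time, hidden]` layout (sizes are illustrative):

    import numpy as np

    y = np.random.randn(4, 12, 32)                    # [batch, time, hidden]
    lengths = np.array([12, 10, 9, 8])
    mask = np.arange(12)[None, :] < lengths[:, None]  # [batch, time] bool
    y_masked = y * mask[:, :, None]
    assert np.all(y_masked[3, 8:] == 0)               # steps past length 8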
+ +import torch + +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = 
rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, (h1, c1) = rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if 
self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py new file mode 100644 index 0000000000000..3620768262b5c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -0,0 +1,468 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net_static +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNN( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone().clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + 
dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = 
self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTM( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + 
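# Aside: the pattern these static-graph tests rely on, shown on a stand-in
# layer rather than the RNN layers themselves. The program pair is built
# under program_guard, the startup program runs once inside a private Scope,
# and that Scope is reused for every later run so converted parameters stay
# visible. This is only a sketch; `paddle.static.data` is assumed to be the
# current spelling of the `paddle.data` used elsewhere in these tests.
import numpy as np
import paddle
paddle.enable_static()

mp, sp = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(mp, sp):
    x = paddle.static.data("x", [-1, 16], dtype="float32")
    y = paddle.nn.Linear(16, 32)(x)

exe = paddle.static.Executor(paddle.CPUPlace())
scope = paddle.fluid.Scope()
with paddle.static.scope_guard(scope):
    exe.run(sp)                                # initialize parameters once
    out, = exe.run(mp, feed={"x": np.ones([4, 16], "float32")},
                   fetch_list=[y])
assert out.shape == (4, 32)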
scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, (h, c) = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + 
self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 44710975edfd6..b15a4310dad8a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -837,9 +837,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None): for i, rnn_layer in enumerate(self): if i > 0: inputs = F.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") + inputs, self.dropout, mode="upscale_in_train") outputs, final_state = rnn_layer(inputs, states[i], sequence_length) final_states.append(final_state) inputs = outputs From 14574d4407cc836155cbe20c5857ac904eebdfb5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 24 Aug 2020 20:04:54 +0800 Subject: [PATCH 06/14] disable gpu tests when paddle is not compiled with cuda support --- python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py | 4 +++- .../fluid/tests/unittests/rnn/test_rnn_cells_static.py | 6 ++++-- python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py | 4 +++- .../fluid/tests/unittests/rnn/test_rnn_nets_static.py | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py index 78f4bbab3b354..8d2677229a03f 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -157,8 +157,10 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for bias in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: suite.addTest(test_class(bias, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py index c371e4eff92e7..ede4c3ac189d4 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -320,8 +320,10 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for bias in [True, False]: - for device in ["cpu", "gpu"]: - for test_class in [TestSimpleRNNCell, TestLSTMCell, TestGRUCell]: + for device in devices: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: suite.addTest(test_class(bias, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 16c790000e862..53b69efb2b3c8 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -261,9 +261,11 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for direction in ["forward", "backward", "bidirectional"]: for 
time_major in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: suite.addTest(test_class(time_major, direction, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py index 3620768262b5c..90ed6b8b4c907 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -460,9 +460,11 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for direction in ["forward", "backward", "bidirectional"]: for time_major in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: suite.addTest(test_class(time_major, direction, device)) return suite From 779e2263610b0ca27b0c5421499a5a7b16baa275 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 09:39:57 +0800 Subject: [PATCH 07/14] remove unnecessary imports --- .../paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py | 3 --- python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py index ede4c3ac189d4..948e47d5b9946 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch -torch.set_default_dtype(torch.float64) - import paddle paddle.framework.set_default_dtype("float64") diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 53b69efb2b3c8..ef297b3bb6249 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch - import paddle paddle.set_default_dtype("float64") from paddle.fluid.layers import sequence_mask From 565ddb95445c9f00f15412d7631c67d3487b20d7 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 10:18:06 +0800 Subject: [PATCH 08/14] fix docstring --- python/paddle/nn/layer/rnn.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b15a4310dad8a..c24db317622d7 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -288,7 +288,9 @@ class SimpleRNNCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python + import paddle paddle.disable_static() @@ -413,6 +415,7 @@ class LSTMCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python import paddle @@ -546,6 +549,7 @@ class GRUCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python import paddle @@ -688,6 +692,7 @@ class RNN(Layer): shape and dtype as the corresponding tensor in initial states. Examples: + .. 
code-block:: python import paddle @@ -767,6 +772,7 @@ class BiRNN(Layer): cell and backward cell. Examples: + .. code-block:: python import paddle @@ -911,6 +917,7 @@ class SimpleRNN(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle @@ -1047,6 +1054,7 @@ class LSTM(RNNMixin): the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle @@ -1175,6 +1183,7 @@ class GRU(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle From 9569a55a5955ee6f10df63144001aee310e7efd6 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 10:43:38 +0800 Subject: [PATCH 09/14] add to no_sample wlist --- tools/wlist.json | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/wlist.json b/tools/wlist.json index 6a0360fbcd9d0..64949d7a28cc5 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -142,7 +142,20 @@ "Callback.on_eval_batch_end", "Callback.on_test_batch_begin", "Callback.on_test_batch_end", - "Model.prepare" + "Model.prepare", + "SimpleRNNCell", + "SimpleRNNCell.forward", + "LSTMCell", + "LSTMCell.forward", + "GRUCell", + "GRUCell.forward", + "SimpleRNN", + "GRU", + "LSTM", + "RNN", + "BiRNN", + "RNNCellBase", + "RNNCellBase.get_initial_states" ], "wlist_no_op_pass":[ "gelu", From 07bde98e2efceb17ddef28e1b2a715a1b5626cad Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 14:00:04 +0800 Subject: [PATCH 10/14] backport to python2 to avoid yield from --- python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py index 725d7df2df3a5..7e0b8374b95cf 100644 --- a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -272,7 +272,8 @@ def flatten(nested): def _flatten(nested): for item in nested: if isinstance(item, (list, tuple)): - yield from _flatten(item) + for subitem in _flatten(item): + yield subitem else: yield item From ed3d925272faf18fd4636edd9fb9bc8e953cb822 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 26 Aug 2020 10:48:28 +0800 Subject: [PATCH 11/14] add **kwargs, fix typos --- python/paddle/fluid/layers/rnn.py | 90 +++--- python/paddle/nn/layer/rnn.py | 505 ++++++++++++++++++------------ 2 files changed, 344 insertions(+), 251 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index ae6539370f25f..d1b0e4961138c 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -38,6 +38,7 @@ 'Decoder', 'BeamSearchDecoder', 'rnn', + 'birnn', 'dynamic_decode', 'DecodeHelper', 'TrainingHelper', @@ -444,33 +445,29 @@ def rnn(cell, Arguments: cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): A (possibly nested structure of) tensor[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states(Tensor, optional): A (possibly nested structure of) - tensor[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. 
- sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - time_major(bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - is_reverse(bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + inputs(Tensor): the input sequences. + If time_major is True, the shape is + `[time_steps, batch_size, input_size]` + else the shape is `[batch_size, time_steps, input_size]`. + initial_states(Tensor|tuple|list, optional): the initial state of the + rnn cell. Tensor or a possibly nested structure of tensors. If not + provided, `cell.get_initial_states` would be called to produce + the initial state. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Defaults to False. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. Returns: (outputs, final_states) outputs (Tensor|list|tuple): the output sequence. Tensor or nested - structure of Tensor. + structure of Tensors. If `time_major` is True, the shape of each tensor in outpus is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. @@ -651,8 +648,8 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) -def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, - time_major): +def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, + **kwargs): """ birnn creates a bidirectional recurrent neural network specified by RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` @@ -668,28 +665,25 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, for `time_major == True`. It represents the inputs to be unrolled in RNN. initial_states(tuple, optional): A tuple of - If not provided, `cell.get_initial_states` would be used to produce - the each initial state. Defaults to None. - sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - time_major(bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, time_steps, ...]`. 
If - `True`, the data layout would be time major with shape - `[time_steps, batch_size, ...]`. Default: `False`. - **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + If not provided, `cell.get_initial_states` would be called to + produce initial state for each cell. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + **kwargs: Additional keyword arguments to pass to `forward` of each cell. Returns: - outputs (Tensor): A (possibly nested structure of) tensor variable[s], - the outputs of the bidirectional RNN. It is the concatenation - of the outputs for both the forward RNN and backward RNN along - the last axis. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. + (outputs, final_states) + outputs (Tensor): the outputs of the bidirectional RNN. It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. + If time major is True, the shape is `[time_steps, batch_size, size]`, + else the shape is `[batch_size, time_steps, size]`, where size is + `cell_fw.hidden_size + cell_bw.hidden_size`. final_states (tuple): A tuple of the final states of the forward cell and backward cell. @@ -712,14 +706,16 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, inputs, states_fw, sequence_length, - time_major=time_major) + time_major=time_major, + **kwargs) outputs_bw, states_bw = rnn(cell_bw, inputs, states_bw, sequence_length, time_major=time_major, - is_reverse=True) + is_reverse=True, + **kwargs) outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw, outputs_bw) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index c24db317622d7..2f5756459709a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -149,30 +149,35 @@ def get_initial_states(self, batch_ref, shape=None, dtype=None, - init_value=0, + init_value=0., batch_dim_idx=0): r""" Generate initialized states according to provided shape, data type and value. Arguments: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possibly nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possibly nested structure of) data type[s]. The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If None and - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is None. - init_value: A float value used to initialize states. - batch_dim_idx: An integer indicating which dimension of the tensor in - inputs represents batch size. The default value is 0. 
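For concreteness, the `time_major` / `sequence_length` contract documented above can be
exercised with the numpy reference implementation this patch adds under the rnn unit-test
directory (`rnn_numpy`); the import below assumes running from that directory, and the
snippet illustrates the semantics only, not the fluid operators themselves:

    import numpy as np
    from rnn_numpy import LSTMCell, rnn, birnn

    cell = LSTMCell(16, 32)
    x = np.random.randn(4, 12, 16)          # batch-major: [batch, time, input]
    lengths = np.array([12, 10, 9, 8])
    y, (h, c) = rnn(cell, x, cell.init_state(x), sequence_length=lengths)
    assert y.shape == (4, 12, 32) and h.shape == (4, 32)

    # bidirectional: the two directions are concatenated on the last axis
    cell_fw, cell_bw = LSTMCell(16, 32), LSTMCell(16, 32)
    states = (cell_fw.init_state(x), cell_bw.init_state(x))
    y2, _ = birnn(cell_fw, cell_bw, x, states, sequence_length=lengths)
    assert y2.shape == (4, 12, 64)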
+ batch_ref (Tensor): A tensor, which shape would be used to + determine the batch size, which is used to generate initial + states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is + treated as batch size. + shape (list|tuple, optional): A (possibly nested structure of) shape[s], + where a shape is a list/tuple of integer). `-1` (for batch size) + will be automatically prepended if a shape does not starts with + it. If None, property `state_shape` will be used. Defaults to + None. + dtype (str|list|tuple, optional): A (possibly nested structure of) + data type[s]. The structure must be same as that of `shape`, + except when all tensors' in states has the same data type, a + single data type can be used. If None and property `cell.state_shape` + is not available, current default floating type of paddle is + used. Defaults to None. + init_value (float, optional): A float value used to initialize states. + Defaults to 0. + batch_dim_idx (int, optional): An integer indicating which + dimension of the of `batch_ref` represents batch. Defaults to 0. Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. + init_states (Tensor|tuple|list): tensor of the provided shape and + dtype, or list of tensors that each satisfies the requirements, + packed in the same structure as `shape` and `type` does. """ # TODO: use inputs and batch_size batch_ref = flatten(batch_ref)[0] @@ -209,7 +214,7 @@ def __init__(self, shape): # nested structure of dtypes try: states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default + except NotImplementedError: states_dtypes = framework.get_default_dtype() if len(flatten(states_dtypes)) == 1: dtype = flatten(states_dtypes)[0] @@ -229,8 +234,8 @@ def state_shape(self): r""" Abstract method (property). Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically + A (possiblely nested structure of) shape[s], where a shape is a + list/tuple of integers (-1 for batch size would be automatically inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by `get_initial_states` or the `shape` argument is provided when using @@ -257,7 +262,8 @@ def state_dtype(self): class SimpleRNNCell(RNNCellBase): r""" - Elman RNN (SimpleRNN) cell. + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. The formula used is as follows: @@ -274,9 +280,9 @@ class SimpleRNNCell(RNNCellBase): Arguments: input_size (int): The input size. hidden_size (int): The hidden size. - nonlinearity (str): The activation in the SimpleRNN cell. It can be - `tanh` or `relu`. Defaults to `tanh`. - weight_ih_attr(ParamAttr, optional): The parameter attribute for + nonlinearity (str, optional): The activation in the SimpleRNN cell. + It can be `tanh` or `relu`. Defaults to `tanh`. + weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. weight_hh_attr(ParamAttr, optional): The parameter attribute for `weight_hh`. Default: None. @@ -287,6 +293,37 @@ class SimpleRNNCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
+ Parameters: + weight_ih (Parameter): shape (hidden_size, input_size), input to hidden + weight, corresponding to :math:`W_{ih}` in the formula. + weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to + hidden weight, corresponding to :math:`W_{hh}` in the formula. + bias_ih (Parameter): shape (hidden_size, ), input to hidden bias, + corresponding to :math:`b_{ih}` in the formula. + bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias, + corresponding to :math:`b_{hh}` in the formula. + + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (Tensor, optional): shape `[batch_size, hidden_size]`, the + previous hidden state, corresponding to :math:`h_{t-1}` in the + formula. When states is None, zero state is used. Defaults to + None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -344,24 +381,6 @@ def __init__(self, else F.relu def forward(self, inputs, states=None): - r""" - Given the input and previous atate, compute the output and update state. - - Arguments: - inputs (Tensor): shape `[batch_size, input_size]`, the input, - corresponding to :math:`x_t` in the formula. - states (Tensor, optional): shape `[batch_size, hidden_size]`, the - previous hidden state, corresponding to :math:`h_{t-1}` in the - formula. When states is None, zero state is used. Defaults to - None. - Returns: - (outputs, new_states) - outputs (Tensor): shape `[batch_size, hidden_size]`, the output, - corresponding to :math:`h_{t}` in the formula. - states (Tensor): shape `[batch_size, hidden_size]`, the new hidden - state, corresponding to :math:`h_{t}` in the formula. - - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_h = states @@ -381,7 +400,8 @@ def state_shape(self): class LSTMCell(RNNCellBase): r""" - Long-Short Term Memory(LSTM) RNN cell. + Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. The formula used is as follows: @@ -414,6 +434,42 @@ class LSTMCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Parameters: + weight_ih (Parameter): shape (4 * hidden_size, input_size), input to + hidden weight, which corresponds to the concatenation of + :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula. + weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to + hidden weight, which corresponds to the concatenation of + :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula. + bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias, + which corresponds to the concatenation of + :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula. + bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, + which corresponds to the concatenation of + :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula. 
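To make the packed-gate layout above concrete, a quick shape check can help; this is a sketch that assumes the `LSTMCell(input_size, hidden_size)` constructor used elsewhere in this patch, with the gates concatenated in the order i, f, g, o stated above.

    import paddle
    paddle.disable_static()

    cell = paddle.nn.LSTMCell(16, 32)
    print(cell.weight_ih.shape)  # [128, 16]: 4 * hidden_size rows, input_size columns
    print(cell.weight_hh.shape)  # [128, 32]
    print(cell.bias_ih.shape)    # [128]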
+ + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (tuple, optional): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the previous hidden state, + corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. + When states is None, zero state is used. Defaults to None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (tuple): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the new hidden states, + corresponding to :math:`h_{t}, c{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -464,25 +520,6 @@ def __init__(self, self._activation = paddle.tanh def forward(self, inputs, states=None): - r""" - Given the input and previous atate, compute the output and update state. - - Arguments: - inputs (Tensor): shape `[batch_size, input_size]`, the input, - corresponding to :math:`x_t` in the formula. - states (tuple, optional): a tuple of two tensors, each of shape - `[batch_size, hidden_size]`, the previous hidden state, - corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. - When states is None, zero state is used. Defaults to None. - Returns: - (outputs, new_states) - outputs (Tensor): shape `[batch_size, hidden_size]`, the output, - corresponding to :math:`h_{t}` in the formula. - states (tuple): a tuple of two tensors, each of shape - `[batch_size, hidden_size]`, the new hidden states, - corresponding to :math:`h_{t}, c{t}` in the formula. - - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states @@ -516,7 +553,8 @@ def state_shape(self): class GRUCell(RNNCellBase): r""" - Gated Recurrent Unit (GRU) RNN cell. + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. The formula for GRU used is as follows: @@ -548,6 +586,39 @@ class GRUCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Parameters: + weight_ih (Parameter): shape (3 * hidden_size, input_size), input to + hidden weight, which corresponds to the concatenation of + :math:`W_{ir}, W_{iz}, W_{ic}` in the formula. + weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to + hidden weight, which corresponds to the concatenation of + :math:`W_{hr}, W_{hz}, W_{hc}` in the formula. + bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias, + which corresponds to the concatenation of + :math:`b_{ir}, b_{iz}, b_{ic}` in the formula. + bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias, + which corresponds to the concatenation of + :math:`b_{hr}, b_{hz}, b_{hc}` in the formula. + + Inputs: + inputs (Tensor): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. + states (Tensor): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. 
+ states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -598,23 +669,6 @@ def __init__(self, self._activation = paddle.tanh def forward(self, inputs, states=None): - r""" - Performs single step GRU calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tensor with shape `[batch_size, hidden_size]`. - corresponding to :math:`h_{t-1}` in the formula. The data type - should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ - `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ - corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) @@ -641,55 +695,57 @@ def state_shape(self): r""" The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch size would be automatically inserted into shape). The shape corresponds - to :math:`h_{t-1}`. + to the shape of :math:`h_{t-1}`. """ return (self.hidden_size, ) class RNN(Layer): r""" - Wrapper for RNN, which creates a recurrent neural network specified with a - RNN cell. It performs :code:`cell.forward()` repeatedly until reaches to - the maximum length of `inputs`. + Wrapper for RNN, which creates a recurrent neural network with an RNN cell. + It performs :code:`cell.forward()` repeatedly until it reaches the maximum + length of `inputs`. Arguments: - cell(RNNCellBase): An instance of `RNNCell`. + cell(RNNCellBase): An instance of `RNNCellBase`. is_reverse (bool, optional): Indicate whether to calculate in the reverse order of input sequences. Defaults to False. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, time_steps, ...]`. If - `True`, the data layout would be time major with shape - `[time_steps, batch_size, ...]`. Defaults to False. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. Inputs: - inputs (Tensor): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Tensor|list|tuple, optional): A (possibly nested structure of) - tensor[s], representing the initial state for the rnn cell. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Defaults to None. + inputs (Tensor): A (possibly nested structure of) tensor[s]. The input + sequences. + If time major is True, the shape is `[time_steps, batch_size, input_size]` + If time major is False, the shape is `[batch_size, time_steps, input_size]` + where `input_size` is the input size of the cell.
+ initial_states (Tensor|list|tuple, optional): Tensor of a possibly + nested structure of tensors, representing the initial state for + the rnn cell. If not provided, `cell.get_initial_states` would be + called to produce the initial states. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. - Outputs: + Returns: (outputs, final_states) - outputs (Tensor|list|tuple): the output sequence. Tensor or nested - structure of Tensor. - If `time_major` is True, the shape of each tensor in outpus is + outputs (Tensor|list|tuple): the output sequences. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. - final_states (Tensor|list|tuple): final states. A (possibly nested structure of) - tensor[s], representing the final state for RNN. It has the same - structure of intial state. Each tensor in final states has the same - shape and dtype as the corresponding tensor in initial states. + final_states (Tensor|list|tuple): final states of the cell. Tensor or + a possibly nested structure of tensors which has the same structure + with intial state. Each tensor in final states has the same shape + and dtype as the corresponding tensor in initial states. + + Notes: + This class is a low level API for wrapping rnn cell into a RNN network. + Users should take care of the state of the cell. If `initial_states` is + passed to the `forward` method, make sure that it satisfies the + requirements of the cell. Examples: @@ -716,7 +772,11 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.is_reverse = is_reverse self.time_major = time_major - def forward(self, inputs, initial_states=None, sequence_length=None): + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): if initial_states is None: initial_states = self.cell.get_initial_states( batch_ref=inputs, @@ -728,48 +788,58 @@ def forward(self, inputs, initial_states=None, sequence_length=None): initial_states=initial_states, sequence_length=sequence_length, time_major=self.time_major, - is_reverse=self.is_reverse) + is_reverse=self.is_reverse, + **kwargs) return final_outputs, final_states class BiRNN(Layer): r""" - Wrapper for bidirectional RNN. It assembles two RNN cells by performing - forward and backward RNN separately, and concat outputs. + Wrapper for bidirectional RNN, which builds a bidiretional RNN given the + forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and + backward RNN with coresponding cells separately and concats the outputs + along the last axis. - Parameters: - cell_fw (RNNCellBase): A RNNCell instance used for forward RNN. - cell_bw (RNNCellBase): A RNNCell instance used for backward RNN. + Arguments: + cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN. + cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN. time_major (bool): Whether the first dimension of the input means the - time steps. + time steps. 
Defaults to False. Inputs: - inputs (Tensor): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (list|tuple, optional): A tuple of the initial states of - the forward cell and backward cell. - If not provided, `cell.get_initial_states` would be used to produce - the initial states. Defaults to None. + inputs (Tensor): the input sequences of both RNN. + If time_major is True, the shape of is + `[time_steps, batch_size, input_size]`, else the shape is + `[batch_size, time_steps, input_size]`, where input_size is the + input size of both cells. + initial_states (list|tuple, optional): A tuple/list of the initial + states of the forward cell and backward cell. Defaults to None. + If not provided, `cell.get_initial_states` would be called to + produce the initial states for each cell. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + **kwargs: Additional keyword arguments. Arguments passed to `forward` + for each cell. Outputs: - outputs (Tensor): A (possibly nested structure of) tensor variable[s], - the outputs of the bidirectional RNN. It is the concatenation - of the outputs for both the forward RNN and backward RNN along - the last axis. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. - final_states (tuple): A tuple of the final states of the forward - cell and backward cell. + (outputs, final_states) + outputs (Tensor): the outputs of the bidirectional RNN. It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. + If time major is True, the shape is `[time_steps, batch_size, size]`, + else the shape is `[batch_size, time_steps, size]`, where size is + `cell_fw.hidden_size + cell_bw.hidden_size`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. + + Notes: + This class is a low level API for wrapping rnn cells into a BiRNN + network. Users should take care of the states of the cells. + If `initial_states` is passed to the `forward` method, make sure that + it satisfies the requirements of the cells. 
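As a concrete illustration of the note above, passing per-direction initial states that match each cell's state structure could look like this; a sketch using two LSTMCells (whose states are `(h, c)` pairs), mirroring the `birnn` example added later in this series.

    import paddle
    paddle.disable_static()

    cell_fw = paddle.nn.LSTMCell(16, 32)
    cell_bw = paddle.nn.LSTMCell(16, 32)
    rnn = paddle.nn.BiRNN(cell_fw, cell_bw)

    inputs = paddle.rand((4, 23, 16))
    hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
    hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
    # one (h, c) pair per direction, in (forward, backward) order
    outputs, final_states = rnn(inputs, ((hf, cf), (hb, cb)))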
Examples: @@ -791,6 +861,10 @@ def __init__(self, cell_fw, cell_bw, time_major=False): super(BiRNN, self).__init__() self.cell_fw = cell_fw self.cell_bw = cell_bw + if cell_fw.input_size != cell_bw.input_size: + raise ValueError("input size of forward cell({}) does not equals" + "that of backward cell({})".format( + cell_fw.input_size, cell_bw.input_size)) for cell in [self.cell_fw, self.cell_bw]: if not hasattr(cell, "call"): # for non-dygraph mode, `rnn` api uses cell.call @@ -810,13 +884,13 @@ def forward(self, outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, initial_states, sequence_length, - self.time_major) + self.time_major, **kwargs) return outputs, final_states class RNNMixin(LayerList): r""" - A Mixin class for RNN networks. It provides forward method for SimpleRNN, + A Mixin class for RNN networks. It provides `forward` method for SimpleRNN, LSTM and GRU. """ @@ -843,7 +917,10 @@ def forward(self, inputs, initial_states=None, sequence_length=None): for i, rnn_layer in enumerate(self): if i > 0: inputs = F.dropout( - inputs, self.dropout, mode="upscale_in_train") + inputs, + self.dropout, + training=self.training, + mode="upscale_in_train") outputs, final_state = rnn_layer(inputs, states[i], sequence_length) final_states.append(final_state) inputs = outputs @@ -855,14 +932,14 @@ def forward(self, inputs, initial_states=None, sequence_length=None): class SimpleRNN(RNNMixin): r""" - Multilayer Elman network(SimpleRNN). It takes a sequence and an initial - state as inputs, and returns the output sequence and the final state. + Multilayer Elman network(SimpleRNN). It takes input sequences and initial + states as inputs, and returns the output sequences and the final states. - Each layer inside the SimpleRNN maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) - and new state(:math:`h_{t}`). + Each layer inside the SimpleRNN maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). .. math:: @@ -875,23 +952,23 @@ class SimpleRNN(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - nonlinearity (str): The activation in each SimpleRNN cell. It can be + num_layers (int, optional): Number of layers. Defaults to 1. + nonlinearity (str, optional): The activation in each SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. - direction (str): The direction of the network. It can be "forward", + direction (str, optional): The direction of the network. It can be "forward", "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. - time_major (bool): Whether the first dimension of the input means the - time steps. + dropout (float, optional): The droput probability. Dropout is applied to the + input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input means the + time steps. Defaults to False. 
weight_ih_attr (ParamAttr, optional): The parameter attribute for - `weight_ih` of each cell. Default: None. + `weight_ih` of each cell. Defaults to None. weight_hh_attr (ParamAttr, optional): The parameter attribute for - `weight_hh` of each cell. Default: None. + `weight_hh` of each cell. Defaults to None. bias_ih_attr (ParamAttr, optional): The parameter attribute for the - `bias_ih` of each cells. Default: None. + `bias_ih` of each cells. Defaults to None. bias_ih_attr (ParamAttr, optional): The parameter attribute for the - `bias_hh` of each cells. Default: None. + `bias_hh` of each cells. Defaults to None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -903,18 +980,24 @@ class SimpleRNN(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. final_states (Tensor): final states. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: @@ -990,13 +1073,13 @@ def __init__(self, class LSTM(RNNMixin): r""" Multilayer LSTM. It takes a sequence and an initial state as inputs, and - returns the output sequence and the final state. + returns the output sequences and the final states. - Each layer inside the LSTM maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step - output(:math:`y_{t}`) and new state(:math:`h_{t}, c_{t}`). + Each layer inside the LSTM maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`). .. math:: @@ -1014,13 +1097,13 @@ class LSTM(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - direction (str): The direction of the network. It can be "forward", - "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. - time_major (bool): Whether the first dimension of the input means the - time steps. 
+ num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input + means the time steps. Defaults to False. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1040,18 +1123,25 @@ class LSTM(RNNMixin): the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. - final_states (Tensor): the final state, a tuple of (h, c), - the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + If `time_major` is False, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + final_states (Tensor): the final state, a tuple of two tensors, h and c. + The shape of each is + `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: @@ -1120,14 +1210,14 @@ def __init__(self, class GRU(RNNMixin): r""" - Multilayer GRU. It takes a sequence and an initial state as inputs, and - returns the output sequence and the final state. + Multilayer GRU. It takes input sequencse and initial states as inputs, and + returns the output sequences and the final states. - Each layer inside the GRU maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) - and new state(:math:`h_{t}`). + Each layer inside the GRU maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). .. math:: @@ -1143,13 +1233,13 @@ class GRU(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - direction (str): The direction of the network. It can be "forward", - "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. 
- time_major (bool): Whether the first dimension of the input means the - time steps. + num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input + means the time steps. Defaults to False. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1167,20 +1257,27 @@ class GRU(RNNMixin): else, the shape is `[batch_size, time_steps, hidden_size]`. initial_states (Tensor, optional): the initial state. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. - If initial_state is not given, zero initial states are used. + If initial_state is not given, zero initial states are used. + Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. final_states (Tensor): final states. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: From d843db8d783ba1bf77c06ea708229b35ba24aa86 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 26 Aug 2020 16:40:22 +0800 Subject: [PATCH 12/14] update docstrings for birnn --- python/paddle/fluid/layers/rnn.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index d1b0e4961138c..632569fa4fbe3 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -658,13 +658,14 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, along the last axis. Arguments: - cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): A (possibly nested structure of) tensor[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states(tuple, optional): A tuple of + cell_fw(RNNCellBase): An instance of `RNNCellBase`. + cell_bw(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): the input sequences. + If time_major is True, the shape is + `[time_steps, batch_size, input_size]` + else the shape is `[batch_size, time_steps, input_size]`. + initial_states(tuple, optional): A tuple of initial states of + `cell_fw` and `cell_bw`. 
If not provided, `cell.get_initial_states` would be called to produce initial state for each cell. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 From d0f9fba010edeffec711746b8cfaaf95fb58fcd3 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 27 Aug 2020 09:45:55 +0800 Subject: [PATCH 13/14] rename argument for SimpleRNN and SimpleRNNCell, fix sample code --- python/paddle/fluid/layers/rnn.py | 28 ++++++++++++++----- python/paddle/nn/layer/rnn.py | 45 ++++++++++++++++--------------- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 632569fa4fbe3..85de86a42c0ad 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -648,7 +648,12 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) -def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False, **kwargs): """ birnn creates a bidirectional recurrent neural network specified by @@ -686,8 +691,7 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`. final_states (tuple): A tuple of the final states of the forward - cell and backward cell. - + cell and backward cell. Examples: @@ -696,12 +700,22 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, import paddle paddle.disable_static() - cell_fw = LSTMCell(16, 32) - cell_bw = LSTMCell(16, 32) - inputs = paddle.rand((2, 23, 16)) - outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs) + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + + inputs = paddle.rand((4, 23, 16)) + hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) + hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) + initial_states = ((hf, cf), (hb, cb)) + outputs, final_states = paddle.nn.functional.birnn( + cell_fw, cell_bw, inputs, initial_states) """ + if initial_states is None: + state_fw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) + state_bw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) states_fw, states_bw = initial_states outputs_fw, states_fw = rnn(cell_fw, inputs, diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 2f5756459709a..6f1c5f199ac99 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -280,7 +280,7 @@ class SimpleRNNCell(RNNCellBase): Arguments: input_size (int): The input size. hidden_size (int): The hidden size. - nonlinearity (str, optional): The activation in the SimpleRNN cell. + activation (str, optional): The activation in the SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. 
@@ -342,7 +342,7 @@ class SimpleRNNCell(RNNCellBase): def __init__(self, input_size, hidden_size, - nonlinearity="tanh", + activation="tanh", weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, @@ -371,13 +371,13 @@ def __init__(self, self.input_size = input_size self.hidden_size = hidden_size - if nonlinearity not in ["tanh", "relu"]: + if activation not in ["tanh", "relu"]: raise ValueError( - "nonlinearity for SimpleRNNCell should be tanh or relu, " - "but get {}".format(nonlinearity)) - self.nonlinearity = nonlinearity - self._nonlinear_fn = paddle.tanh \ - if nonlinearity == "tanh" \ + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ else F.relu def forward(self, inputs, states=None): @@ -390,7 +390,7 @@ def forward(self, inputs, states=None): h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h2h += self.bias_hh - h = self._nonlinear_fn(i2h + h2h) + h = self._activation_fn(i2h + h2h) return h, h @property @@ -479,9 +479,10 @@ class LSTMCell(RNNCellBase): x = paddle.randn((4, 16)) prev_h = paddle.randn((4, 32)) + prev_c = paddle.randn((4, 32)) cell = paddle.nn.LSTMCell(16, 32) - y, h = cell(x, prev_h) + y, (h, c) = cell(x, (prev_h, prev_c)) """ @@ -758,7 +759,7 @@ class RNN(Layer): prev_h = paddle.randn((4, 32)) cell = paddle.nn.SimpleRNNCell(16, 32) - rnn = paddle.RNN(cell) + rnn = paddle.nn.RNN(cell) outputs, final_states = rnn(inputs, prev_h) """ @@ -848,9 +849,9 @@ class BiRNN(Layer): import paddle paddle.disable_static() - cell_fw = LSTMCell(16, 32) - cell_bw = LSTMCell(16, 32) - rnn = BidirectionalRNN(cell_fw, cell_bw) + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + rnn = paddle.nn.BiRNN(cell_fw, cell_bw) inputs = paddle.rand((2, 23, 16)) outputs, final_states = rnn(inputs) @@ -953,7 +954,7 @@ class SimpleRNN(RNNMixin): input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. num_layers (int, optional): Number of layers. Defaults to 1. - nonlinearity (str, optional): The activation in each SimpleRNN cell. It can be + activation (str, optional): The activation in each SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. direction (str, optional): The direction of the network. It can be "forward", "backward" and "bidirectional". Defaults to "forward". 
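A minimal sketch of the renamed `activation` argument in use, with hypothetical sizes and following the style of the examples fixed in this patch:

    import paddle
    paddle.disable_static()

    x = paddle.randn((4, 16))
    prev_h = paddle.randn((4, 32))

    cell = paddle.nn.SimpleRNNCell(16, 32, activation="relu")
    y, h = cell(x, prev_h)  # both are the new hidden state, shape [4, 32]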
@@ -1018,7 +1019,7 @@ def __init__(self, input_size, hidden_size, num_layers=1, - nonlinearity="tanh", + activation="tanh", direction="forward", dropout=0., time_major=False, @@ -1031,29 +1032,29 @@ def __init__(self, if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity, + cell = SimpleRNNCell(hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell_fw = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell_bw = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): cell_fw = SimpleRNNCell( - 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + 2 * hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) cell_bw = SimpleRNNCell( - 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + 2 * hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: From db45fa3aa30376d24c64de0406aec64e60d7859c Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 27 Aug 2020 15:34:36 +0800 Subject: [PATCH 14/14] add default value for initial_states in fluid.layers.birnn --- python/paddle/fluid/layers/rnn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 85de86a42c0ad..fe8ed83923e88 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -651,7 +651,7 @@ def _switch_grad(x, stop=False): def birnn(cell_fw, cell_bw, inputs, - initial_states, + initial_states=None, sequence_length=None, time_major=False, **kwargs): @@ -712,11 +712,12 @@ def birnn(cell_fw, """ if initial_states is None: - state_fw = cell_fw.get_initial_states( + states_fw = cell_fw.get_initial_states( batch_ref=inputs, batch_dim_idx=1 if time_major else 0) - state_bw = cell_fw.get_initial_states( + states_bw = cell_bw.get_initial_states( batch_ref=inputs, batch_dim_idx=1 if time_major else 0) - states_fw, states_bw = initial_states + else: + states_fw, states_bw = initial_states outputs_fw, states_fw = rnn(cell_fw, inputs, states_fw,
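With the default added in this last patch, `birnn` can be called without explicit initial states; a sketch mirroring the earlier example in this series but relying on the zero states derived from `inputs`:

    import paddle
    paddle.disable_static()

    cell_fw = paddle.nn.LSTMCell(16, 32)
    cell_bw = paddle.nn.LSTMCell(16, 32)
    inputs = paddle.rand((4, 23, 16))

    # initial_states defaults to None; zero states are built per cell from `inputs`
    outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs)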