From eb811857226d7c48e51b04fe5ab596db9b23ed12 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 10 Aug 2020 12:33:00 +0800 Subject: [PATCH 01/14] Add RNN related apis in paddl.nn test=develop --- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/rnn.py | 1190 +++++++++++++++++++++++++++- 2 files changed, 1187 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 4963ac360804f..0b8f9fa5a0bc9 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -27,6 +27,7 @@ from .extension import * from .activation import * from .norm import * +from .rnn import * # from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 4717609503f7f..b4ce7678cb747 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -12,10 +12,1192 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define classes of recurrent neural network +# TODO: define classes of recurrent neural network __all__ = [ - # 'RNNCell', - # 'GRUCell', - # 'LSTMCell' + 'RNNCellBase', + 'LSTMCell', + 'GRUCell', + 'StackedRNNCell', + 'StackedLSTMCell', + 'stackedGRUCell', + 'RNN', + 'BidirectionalRNN', + 'LSTM', + 'GRU', ] + +import copy +import collections +import itertools +import six +import sys +import warnings +from functools import partial, reduce + +import numpy as np + +from ... import fluid +from ...fluid import layers +from ...fluid.data_feeder import convert_dtype +from ...fluid.dygraph import Layer, LayerList +from ...fluid.param_attr import ParamAttr +from ...fluid.layers import utils, BeamSearchDecoder +from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as + + +class RNNCellBase(Layer): + """ + RNNCellBase is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0, + batch_dim_idx=0): + """ + Generate initialized states according to provided shape, data type and + value. + Parameters: + batch_ref: A (possibly nested structure of) tensor variable[s]. + The first dimension of the tensor will be used as batch size to + initialize states. + shape: A (possibly nested structure of) shape[s], where a shape is + represented as a list/tuple of integer). -1(for batch size) will + beautomatically inserted if shape is not started with it. If None, + property `state_shape` will be used. The default value is None. + dtype: A (possibly nested structure of) data type[s]. The structure + must be same as that of `shape`, except when all tensors' in states + has the same data type, a single data type can be used. If None and + property `cell.state_shape` is not available, float32 will be used + as the data type. The default value is None. + init_value: A float value used to initialize states. + batch_dim_idx: An integer indicating which dimension of the tensor in + inputs represents batch size. The default value is 0. + Returns: + Variable: tensor variable[s] packed in the same structure provided \ + by shape, representing the initialized states. 
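+        Examples:
+            A minimal sketch of initializing states (it assumes the
+            `paddle.LSTMCell` alias used in the cell docstring examples
+            below):
+
+            .. code-block:: python
+
+                import paddle
+
+                inputs = paddle.rand((2, 4, 32))  # batch size 2 at dim 0
+                cell = paddle.LSTMCell(input_size=32, hidden_size=64)
+                # `shape` defaults to `cell.state_shape` and `init_value` to 0,
+                # so this returns two zero tensors, each with shape [2, 64]
+                init_h, init_c = cell.get_initial_states(batch_ref=inputs)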
+ """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce(lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: # use fp32 as default + states_dtypes = "float32" + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is represented + as a list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). + Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class LSTMCell(RNNCellBase): + """ + Long-Short Term Memory(LSTM) RNN cell. + + The formula used is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size in the LSTM cell. + hidden_size (int): The hidden size in the LSTM cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. 
Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.LSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(LSTMCell, self).__init__(dtype) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = getattr(layers, "sigmoid") + self._activation = getattr(layers, "tanh") + self._param_attr = ParamAttr._to_attr(param_attr) + self._bias_attr = ParamAttr._to_attr(bias_attr) + self._dtype = dtype + + if self._param_attr and self._param_attr.name is not None: + weight_ih_param_attr = copy.deepcopy(self._param_attr) + weight_hh_param_attr = copy.deepcopy(self._param_attr) + weight_ih_param_attr.name += "_weight_ih" + weight_hh_param_attr.name += "_weight_hh" + else: + weight_ih_param_attr = self._param_attr + weight_hh_param_attr = self._param_attr + + if self._bias_attr and self._bias_attr.name is not None: + bias_ih_param_attr = copy.deepcopy(self._bias_attr) + bias_hh_param_attr = copy.deepcopy(self._bias_attr) + bias_ih_param_attr.name += "_bias_ih" + bias_hh_param_attr.name += "_bias_hh" + else: + bias_ih_param_attr = self._bias_attr + bias_hh_param_attr = self._bias_attr + + self.weight_ih = self.create_parameter( + attr=weight_ih_param_attr, + shape=[4 * hidden_size, input_size], + dtype=dtype) + + self.weight_hh = self.create_parameter( + attr=weight_hh_param_attr, + shape=[4 * hidden_size, hidden_size], + dtype=dtype) + + self.bias_ih = self.create_parameter( + attr=bias_ih_param_attr, + shape=[4 * hidden_size], + dtype=dtype, + is_bias=True) + self.bias_hh = self.create_parameter( + attr=bias_hh_param_attr, + shape=[4 * hidden_size], + dtype=dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step LSTM calculations. + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A list of containing two tensors, each shaped + `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` + in the formula. The data type should be float32 or float64. + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula; `new_states` is a list containing \ + two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}, c_{t}` in the formula. The data type of these \ + tensors all is same as that of `states`. 
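+        Examples:
+            A single-step sketch (it assumes the `paddle.LSTMCell` alias used
+            in the class docstring example):
+
+            .. code-block:: python
+
+                import paddle
+
+                step_input = paddle.rand((2, 32))
+                cell = paddle.LSTMCell(input_size=32, hidden_size=64)
+                init_states = cell.get_initial_states(batch_ref=step_input)
+                out, new_states = cell(step_input, init_states)
+                # out and both tensors in new_states have shape [2, 64]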
+ """ + pre_hidden, pre_cell = states + gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih: + gates = gates + self.bias_ih + gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh: + gates = gates + self.bias_hh + + chunked_gates = layers.split(gates, num_or_sections=4, dim=1) + + i = self._gate_activation(chunked_gates[0]) + f = self._gate_activation(chunked_gates[1]) + o = self._gate_activation(chunked_gates[3]) + c = f * pre_cell + i * self._activation(chunked_gates[2]) + h = o * self._activation(c) + + return h, [h, c] + + @property + def state_shape(self): + """ + The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` + (-1 for batch size would be automatically inserted into shape). These two + shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + """ + return [[self.hidden_size], [self.hidden_size]] + + +class GRUCell(RNNCellBase): + """ + Gated Recurrent Unit (GRU) RNN cell. + + The formula for GRU used is as follows: + + .. math:: + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. 
code-block:: python + + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = BasicGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(GRUCell, self).__init__() + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = getattr(layers, "sigmoid") + self._activation = getattr(layers, "tanh") + self._param_attr = ParamAttr._to_attr(param_attr) + self._bias_attr = ParamAttr._to_attr(bias_attr) + self._dtype = dtype + + if self._param_attr and self._param_attr.name is not None: + weight_ih_param_attr = copy.deepcopy(self._param_attr) + weight_hh_param_attr = copy.deepcopy(self._param_attr) + weight_ih_param_attr.name += "_weight_ih" + weight_hh_param_attr.name += "_weight_hh" + else: + weight_ih_param_attr = self._param_attr + weight_hh_param_attr = self._param_attr + + if self._bias_attr and self._bias_attr.name is not None: + bias_ih_param_attr = copy.deepcopy(self._bias_attr) + bias_hh_param_attr = copy.deepcopy(self._bias_attr) + bias_ih_param_attr.name += "_bias_ih" + bias_hh_param_attr.name += "_bias_hh" + else: + bias_ih_param_attr = self._bias_attr + bias_hh_param_attr = self._bias_attr + + self.weight_ih = self.create_parameter( + attr=weight_ih_param_attr, + shape=[3 * hidden_size, input_size], + dtype=dtype) + + self.weight_hh = self.create_parameter( + attr=weight_hh_param_attr, + shape=[3 * hidden_size, hidden_size], + dtype=dtype) + + self.bias_ih = self.create_parameter( + attr=bias_ih_param_attr, + shape=[3 * hidden_size], + dtype=dtype, + is_bias=True) + self.bias_hh = self.create_parameter( + attr=bias_hh_param_attr, + shape=[3 * hidden_size], + dtype=dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step GRU calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. The data type + should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ + `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ + corresponding to :math:`h_t` in the formula. The data type of the \ + tensor is same as that of `states`. + """ + pre_hidden = states + + x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih: + x_gates = x_gates + self.bias_ih + h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh: + h_gates = h_gates + self.bias_hh + + x_u, x_r, x_c = layers.split(x_gates, num_or_sections=3, dim=1) + h_u, h_r, h_c = layers.split(x_gates, num_or_sections=3, dim=1) + + u = self._gate_activation(x_u + h_u) + r = self._gate_activation(x_r + h_r) + h_c = r * h_c + c = self._activation(x_c + h_c) + h = u * pre_hidden + (1 - u) * c + + return h, h + + @property + def state_shape(self): + """ + The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to :math:`h_{t-1}`. + """ + return [self._hidden_size] + + +class StackedRNNCell(RNNCellBase): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. 
+ + Parameters: + cells (list|tuple): List of RNN cell instances. + + Examples: + .. code-block:: python + from paddle import LSTMCell, StackedRNNCell + cells = [LSTMCell(32, 32), LSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) + """ + + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + + def forward(self, inputs, states, **kwargs): + """ + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. + """ + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state, **kwargs) + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + Returns: + list: A list composed of each including cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedLSTMCell(RNNCellBase): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. + + The formula for LSTM used here is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Parameters: + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. 
+ dropout(float, optional): The dropout probability applied on the outputs + of each LSTM cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + LSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + # TODO(guosheng): maybe should stack list of states as one tensor + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedGRUCell(RNNCellBase): + """ + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. 
+ + The formula for GRU used here is as follows: + + .. math:: + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + num_layers(int, optional): The number of GRU to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each GRU cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedGRUCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked GRU cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last GRU; `new_states` \ + is a list composed of every GRU `new_states` which is also \ + :math:`h_{t}` in the formula, and the data type and structure \ + of these tensors all is same as that of `states`. 
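+        Examples:
+            A single-step sketch (it assumes the `paddle.StackedGRUCell` alias
+            used in the class docstring example; the initial states here are
+            arbitrary tensors of the right shape, for illustration only):
+
+            .. code-block:: python
+
+                import paddle
+
+                step_input = paddle.rand((2, 32))
+                cell = paddle.StackedGRUCell(input_size=32, hidden_size=64,
+                                             num_layers=2)
+                # one state tensor shaped [batch_size, hidden_size] per layer
+                states = [paddle.rand((2, 64)) for _ in range(2)]
+                out, new_states = cell(step_input, states)
+                # out: [2, 64]; new_states: a list of two [2, 64] tensors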
+ """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedGRUCell is a list composed of each including + GRU cell's `state_shape`. + + Returns: + list: A list composed of each including GRU cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class RNN(Layer): + """ + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + + Examples: + .. code-block:: python + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) if time_major else (0, + 1) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. 
\ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ + flat_inputs = flatten(inputs) + batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) + + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, + dtype=self.cell.dtype if hasattr(self.cell, "dtype") else + self.cell.parameters()[0].dtype, + batch_dim_idx=self.batch_index) + + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = layers.elementwise_mul( + new_state, step_mask, axis=0) - layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state + + if not self.time_major: + inputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) + + if sequence_length is not None: + mask = layers.sequence_mask( + sequence_length, + maxlen=time_steps, + dtype=flatten(initial_states)[0].dtype) + mask = layers.transpose(mask, [1, 0]) + + if self.is_reverse: + inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), + inputs) + mask = layers.reverse( + mask, axis=[0]) if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + + final_outputs = map_structure( + lambda x: layers.stack(x.array, axis=self.time_step_index), + outputs) + + if self.is_reverse: + final_outputs = map_structure( + lambda x: layers.reverse(x, axis=self.time_step_index), + final_outputs) + + final_states = new_states + else: + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class BidirectionalRNN(Layer): + """ + Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. + Parameters: + cell_fw (RNNCell): A RNNCell instance used for forward RNN. + cell_bw (RNNCell): A RNNCell instance used for backward RNN. + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + Examples: + .. 
code-block:: python + import paddle + from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN + inputs = paddle.rand((2, 4, 32)) + cell_fw = StackedLSTMCell(32, 64) + cell_bw = StackedLSTMCell(32, 64) + bi_rnn = BidirectionalRNN(cell_fw, cell_bw) + outputs, _ = bi_rnn(inputs) # [2, 4, 128] + """ + + def __init__(self, + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None + else: + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs forward and backward RNN separately, and merge outputs of these + two RNN according to `merge_mode`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (Variable|list|tuple): If it is a list or tuple, its + length should be 2 to include initial states of forward and backward + RNN separately. Otherwise it would be used twice for the two RNN. + If None, `cell.get_initial_states` would be used to produce the initial + states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is produced by merge outputs of forward and backward RNN according \ + to `merge_mode`; similarly, `final_states` is produced by merge \ + `final_states` of forward and backward RNN. 
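+        Examples:
+            A sketch of a full forward call (it reuses the imports shown in the
+            class docstring example above; `merge_mode='sum'` is chosen only to
+            illustrate a non-default merge):
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN
+
+                inputs = paddle.rand((2, 4, 32))
+                cell_fw = StackedLSTMCell(32, 64)
+                cell_bw = StackedLSTMCell(32, 64)
+                bi_rnn = BidirectionalRNN(cell_fw, cell_bw, merge_mode='sum')
+                # forward and backward outputs are added elementwise instead
+                # of being concatenated, so the last dimension stays 64
+                outputs, final_states = bi_rnn(inputs)  # [2, 4, 64]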
+ """ + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length, **kwargs) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length, **kwargs) + outputs = map_structure(self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, + outputs_bw) + final_states = map_structure( + self.merge_func, states_fw, + states_bw) if self.merge_func else (states_fw, states_bw) + return outputs, final_states + + @staticmethod + def bidirect_param_attr(param_attr): + """ + Converts `param_attr` to a pair of `param_attr` when it is not a list + or tuple with length 2, also rename every one by appending a suffix to + avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. When + it is a list or tuple, its length must be 2. + + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs + + +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each LSTM except the last one. 0 for not dropout. Default 0.0 + direction (str, optional): Indicate the direction for LSTM calculation + applying on the input sequences. It can be `forward`, `backward` or + `bidirect`. If it is `backward`, calculate in the reverse order of + input sequences. If it is `bidirect`, each layer would be a + bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, + and it concatenates their outputs as outputs. Default: `forward`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. 
If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + Examples: + .. code-block:: python + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + direction="forward", + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.direction = direction + self.num_directions = 2 if direction == 'bidirect' else 1 + self.time_major = time_major + + if direction == 'bidirect': + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) + + # maybe design cell including both forward and backward later + merge_mode = 'concat' + rnns = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + rnns.append( + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major)) + self.lstm = LayerList(rnns) + else: + lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, + is_reverse=(direction == "backward"), + time_major=time_major) + + def forward(self, input, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. 
If not provided, the paddings would be treated same as + non-padding inputs. Default None. + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. + """ + if not isinstance(self.lstm, LayerList): + return self.lstm(input, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list|tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + output, states = self.lstm[i](input, initial_states[i], + sequence_length) + input = output + stacked_states.append(states) + return output, stacked_states From 47688544de190e1ca667cae60bad7b8da9d8d241 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Sat, 15 Aug 2020 15:33:36 +0800 Subject: [PATCH 02/14] new rnn api, cell almost done --- python/paddle/nn/__init__.py | 3 + python/paddle/nn/layer/__init__.py | 4 +- python/paddle/nn/layer/rnn.py | 1465 ++++++++++++++-------------- 3 files changed, 728 insertions(+), 744 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 98948fa91e2e8..cd73ec1336a2b 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -18,6 +18,7 @@ from .layer import norm from .functional import extension from .layer import common +from .layer import rnn from . import initializer @@ -25,6 +26,7 @@ __all__ += norm.__all__ __all__ += extension.__all__ __all__ += common.__all__ +__all__ += rnn.__all__ # TODO: define alias in nn directory # from .clip import ErrorClipByValue #DEFINE_ALIAS @@ -90,6 +92,7 @@ from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS from .layer.norm import InstanceNorm #DEFINE_ALIAS +from .layer.rnn import * # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index fbbcf048f2987..0d8687e91e29e 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -20,6 +20,7 @@ from . import extension from . import activation from . import norm +from . import rnn from .activation import * from .loss import * @@ -67,6 +68,3 @@ from .norm import LayerNorm #DEFINE_ALIAS from .norm import SpectralNorm #DEFINE_ALIAS from .norm import InstanceNorm #DEFINE_ALIAS -# from .rnn import RNNCell #DEFINE_ALIAS -# from .rnn import GRUCell #DEFINE_ALIAS -# from .rnn import LSTMCell #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b4ce7678cb747..b64f04b37879c 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -12,39 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# TODO: define classes of recurrent neural network - -__all__ = [ - 'RNNCellBase', - 'LSTMCell', - 'GRUCell', - 'StackedRNNCell', - 'StackedLSTMCell', - 'stackedGRUCell', - 'RNN', - 'BidirectionalRNN', - 'LSTM', - 'GRU', -] - import copy import collections import itertools import six +import math import sys import warnings from functools import partial, reduce -import numpy as np - from ... import fluid from ...fluid import layers +from ...fluid import initializer as I from ...fluid.data_feeder import convert_dtype from ...fluid.dygraph import Layer, LayerList from ...fluid.param_attr import ParamAttr from ...fluid.layers import utils, BeamSearchDecoder from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as +# TODO: define classes of recurrent neural network + +__all__ = [ + 'RNNCellBase', + 'SimpleRNNCell', + 'LSTMCell', + 'GRUCell', + 'StackedRNNCell', + 'StackedLSTMCell', + # 'stackedGRUCell', + 'RNN', + 'BidirectionalRNN', + 'LSTM', + # 'GRU', +] + class RNNCellBase(Layer): """ @@ -163,6 +164,53 @@ def state_dtype(self): "Please add implementaion for `state_dtype` in the used cell.") +class SimpleRNNCell(RNNCellBase): + def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): + super(SimpleRNNCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = self.create_parameter( + (hidden_size, input_size), default_initializer=I.Uniform(-std, std)) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) + self.bias_ih = self.create_parameter( + (hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) + self.bias_hh = self.create_parameter( + (hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.input_size = input_size + self.hidden_size = hidden_size + if nonlinearity not in ["tanh", "relu"]: + raise ValueError( + "nonlinearity for SimpleRNNCell should be tanh or relu, " + "but get {}".format(nonlinearity)) + self.nonlinearity = nonlinearity + self._nonlinear_fn = layers.tanh \ + if nonlinearity == "tanh" \ + else layers.relu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = layers.matmul(inputs, self.weight_ih, transpose_y=True) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = layers.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._nonlinear_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + class LSTMCell(RNNCellBase): """ Long-Short Term Memory(LSTM) RNN cell. 
@@ -197,69 +245,37 @@ class LSTMCell(RNNCellBase): outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(LSTMCell, self).__init__(dtype) - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = getattr(layers, "sigmoid") - self._activation = getattr(layers, "tanh") - self._param_attr = ParamAttr._to_attr(param_attr) - self._bias_attr = ParamAttr._to_attr(bias_attr) - self._dtype = dtype - - if self._param_attr and self._param_attr.name is not None: - weight_ih_param_attr = copy.deepcopy(self._param_attr) - weight_hh_param_attr = copy.deepcopy(self._param_attr) - weight_ih_param_attr.name += "_weight_ih" - weight_hh_param_attr.name += "_weight_hh" - else: - weight_ih_param_attr = self._param_attr - weight_hh_param_attr = self._param_attr - - if self._bias_attr and self._bias_attr.name is not None: - bias_ih_param_attr = copy.deepcopy(self._bias_attr) - bias_hh_param_attr = copy.deepcopy(self._bias_attr) - bias_ih_param_attr.name += "_bias_ih" - bias_hh_param_attr.name += "_bias_hh" - else: - bias_ih_param_attr = self._bias_attr - bias_hh_param_attr = self._bias_attr - + def __init__(self, input_size, hidden_size, name=None): + super(LSTMCell, self).__init__() + std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - attr=weight_ih_param_attr, - shape=[4 * hidden_size, input_size], - dtype=dtype) - + (4 * hidden_size, input_size), + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( - attr=weight_hh_param_attr, - shape=[4 * hidden_size, hidden_size], - dtype=dtype) - + (4 * hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( - attr=bias_ih_param_attr, - shape=[4 * hidden_size], - dtype=dtype, - is_bias=True) + (4 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( - attr=bias_hh_param_attr, - shape=[4 * hidden_size], - dtype=dtype, - is_bias=True) + (4 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) - def forward(self, inputs, states): + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = layers.sigmoid + self._activation = layers.tanh + + def forward(self, inputs, states=None): """ Performs single step LSTM calculations. Parameters: inputs (Variable): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula. The data type should be float32 or float64. - states (Variable): A list of containing two tensors, each shaped + states (Variable): A tuple of two tensors, each shaped `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. The data type should be float32 or float64. Returns: @@ -270,15 +286,17 @@ def forward(self, inputs, states): to :math:`h_{t}, c_{t}` in the formula. The data type of these \ tensors all is same as that of `states`. 
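+        Examples:
+            A single-step sketch (it assumes this cell is exported as
+            `paddle.nn.LSTMCell` through the updated `paddle.nn` imports in
+            this patch):
+
+            .. code-block:: python
+
+                import paddle
+
+                x = paddle.rand((4, 16))
+                cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
+                # with `states=None`, zero states are created internally via
+                # `get_initial_states`
+                out, (h, c) = cell(x)
+                # out, h and c all have shape [4, 32]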
""" + if states is None: + states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) - if self.bias_ih: + if self.bias_ih is not None: gates = gates + self.bias_ih gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh: + if self.bias_hh is not None: gates = gates + self.bias_hh - chunked_gates = layers.split(gates, num_or_sections=4, dim=1) + chunked_gates = layers.split(gates, num_or_sections=4, dim=-1) i = self._gate_activation(chunked_gates[0]) f = self._gate_activation(chunked_gates[1]) @@ -286,7 +304,7 @@ def forward(self, inputs, states): c = f * pre_cell + i * self._activation(chunked_gates[2]) h = o * self._activation(c) - return h, [h, c] + return h, (h, c) @property def state_shape(self): @@ -295,7 +313,7 @@ def state_shape(self): (-1 for batch size would be automatically inserted into shape). These two shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. """ - return [[self.hidden_size], [self.hidden_size]] + return ((self.hidden_size, ), (self.hidden_size, )) class GRUCell(RNNCellBase): @@ -337,62 +355,30 @@ class GRUCell(RNNCellBase): outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, - input_size, - hidden_size, - param_attr=None, - bias_attr=None, - dtype='float32'): + def __init__(self, input_size, hidden_size, name=None): super(GRUCell, self).__init__() - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = getattr(layers, "sigmoid") - self._activation = getattr(layers, "tanh") - self._param_attr = ParamAttr._to_attr(param_attr) - self._bias_attr = ParamAttr._to_attr(bias_attr) - self._dtype = dtype - - if self._param_attr and self._param_attr.name is not None: - weight_ih_param_attr = copy.deepcopy(self._param_attr) - weight_hh_param_attr = copy.deepcopy(self._param_attr) - weight_ih_param_attr.name += "_weight_ih" - weight_hh_param_attr.name += "_weight_hh" - else: - weight_ih_param_attr = self._param_attr - weight_hh_param_attr = self._param_attr - - if self._bias_attr and self._bias_attr.name is not None: - bias_ih_param_attr = copy.deepcopy(self._bias_attr) - bias_hh_param_attr = copy.deepcopy(self._bias_attr) - bias_ih_param_attr.name += "_bias_ih" - bias_hh_param_attr.name += "_bias_hh" - else: - bias_ih_param_attr = self._bias_attr - bias_hh_param_attr = self._bias_attr - + std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - attr=weight_ih_param_attr, - shape=[3 * hidden_size, input_size], - dtype=dtype) - + (3 * hidden_size, input_size), + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( - attr=weight_hh_param_attr, - shape=[3 * hidden_size, hidden_size], - dtype=dtype) - + (3 * hidden_size, hidden_size), + default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( - attr=bias_ih_param_attr, - shape=[3 * hidden_size], - dtype=dtype, - is_bias=True) + (3 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( - attr=bias_hh_param_attr, - shape=[3 * hidden_size], - dtype=dtype, - is_bias=True) + (3 * hidden_size, ), + is_bias=True, + default_initializer=I.Uniform(-std, std)) - def forward(self, inputs, states): + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = layers.sigmoid + self._activation = layers.tanh + + def forward(self, inputs, states=None): """ Performs single step GRU calculations. 
@@ -410,23 +396,24 @@ def forward(self, inputs, states): corresponding to :math:`h_t` in the formula. The data type of the \ tensor is same as that of `states`. """ - pre_hidden = states + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_hidden = states x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) - if self.bias_ih: + if self.bias_ih is not None: x_gates = x_gates + self.bias_ih h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh: + if self.bias_hh is not None: h_gates = h_gates + self.bias_hh - x_u, x_r, x_c = layers.split(x_gates, num_or_sections=3, dim=1) - h_u, h_r, h_c = layers.split(x_gates, num_or_sections=3, dim=1) + x_r, x_z, x_c = layers.split(x_gates, num_or_sections=3, dim=1) + h_r, h_z, h_c = layers.split(h_gates, num_or_sections=3, dim=1) - u = self._gate_activation(x_u + h_u) r = self._gate_activation(x_r + h_r) - h_c = r * h_c - c = self._activation(x_c + h_c) - h = u * pre_hidden + (1 - u) * c + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c return h, h @@ -437,235 +424,330 @@ def state_shape(self): size would be automatically inserted into shape). The shape corresponds to :math:`h_{t-1}`. """ - return [self._hidden_size] + return (self.hidden_size, ) -class StackedRNNCell(RNNCellBase): +class RNN(Layer): """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. Parameters: - cells (list|tuple): List of RNN cell instances. + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. Examples: .. code-block:: python - from paddle import LSTMCell, StackedRNNCell - cells = [LSTMCell(32, 32), LSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) + import paddle + inputs = paddle.rand((2, 4, 32)) + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = [] - for i, cell in enumerate(cells): - self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) \ + if time_major else (0, 1) - def forward(self, inputs, states, **kwargs): + def forward(self, inputs, initial_states=None, sequence_length=None): """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. 
- + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state, **kwargs) - inputs = outputs - new_states.append(new_state) - return outputs, new_states + flat_inputs = flatten(inputs) + batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - Returns: - list: A list composed of each including cell's `param_attr`. 
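# A simplified, framework-free sketch of the unrolling and sequence_length
# semantics documented for RNN.forward above: the cell runs once per time
# step, and when sequence_length is given, the state of a finished sequence is
# carried over unchanged so the final state is the last valid one. This
# mirrors the behaviour only, not the actual implementation; a single-tensor
# state and batch-major layout are assumed for brevity.
import numpy as np

def unroll(cell_step, inputs, init_state, sequence_length=None):
    # inputs: [batch, time, feature]  (time_major=False)
    time_steps = inputs.shape[1]
    state = init_state
    outputs = []
    for t in range(time_steps):
        out, new_state = cell_step(inputs[:, t], state)
        if sequence_length is not None:
            # freeze states of sequences that have already ended; outputs at
            # padded steps are left as-is, matching the patch
            mask = (t < sequence_length).astype(inputs.dtype)[:, None]
            new_state = mask * new_state + (1.0 - mask) * state
        state = new_state
        outputs.append(out)
    return np.stack(outputs, axis=1), state  # [batch, time, ...], final state

# toy "cell" with a single tensor state: accumulate the inputs
step = lambda x, s: (x + s, x + s)
outs, last = unroll(step,
                    np.ones((2, 5, 3), "float32"),
                    np.zeros((2, 3), "float32"),
                    sequence_length=np.array([5, 3]))
# outs.shape == (2, 5, 3); last[1] is the state after step 3 of the shorter sequence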
- """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, + dtype=inputs.dtype, + batch_dim_idx=self.batch_index) - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] + if fluid.in_dygraph_mode(): + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] -class StackedLSTMCell(RNNCellBase): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. + def append(self, x): + self.array.append(x) + return self - The formula for LSTM used here is as follows: + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = layers.elementwise_mul( + new_state, step_mask, axis=0) - layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + if not self.time_major: + inputs = map_structure( + lambda x: layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. + if sequence_length is not None: + mask = layers.sequence_mask( + sequence_length, maxlen=time_steps, dtype=inputs.dtype) + mask = layers.transpose(mask, [1, 0]) - Examples: - .. 
code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ + if self.is_reverse: + inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), + inputs) + mask = layers.reverse( + mask, axis=[0]) if sequence_length is not None else None - def __init__(self, - input_size, - hidden_size, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.hidden_size = hidden_size - self.input_size = input_size - self.num_layers = num_layers - self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - LSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + final_outputs = map_structure( + lambda x: layers.stack(x.array, axis=self.time_step_index), + outputs) - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. + if self.is_reverse: + final_outputs = map_structure( + lambda x: layers.reverse(x, axis=self.time_step_index), + final_outputs) - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. + final_states = new_states + else: + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - # TODO(guosheng): maybe should stack list of states as one tensor - return outputs, new_states - @property - def state_shape(self): +class BidirectionalRNN(Layer): + """ + Wrapper for bidirectional RNN. 
It assembles two RNNCell instances to perform + forward and backward RNN separately, and merge outputs of these two RNN + according to `merge_mode`. + Parameters: + cell_fw (RNNCell): A RNNCell instance used for forward RNN. + cell_bw (RNNCell): A RNNCell instance used for backward RNN. + merge_mode (str|None, optional): The way to merget outputs of forward and + backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, + where None stands for make the two `outputs` as a tuple, `zip` stands + for make each two corresponding tensors of the two `outputs` as a tuple. + Default `concat` + Examples: + .. code-block:: python + import paddle + from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN + inputs = paddle.rand((2, 4, 32)) + cell_fw = StackedLSTMCell(32, 64) + cell_bw = StackedLSTMCell(32, 64) + bi_rnn = BidirectionalRNN(cell_fw, cell_bw) + outputs, _ = bi_rnn(inputs) # [2, 4, 128] + """ + + def __init__(self, + cell_fw, + cell_bw, + merge_mode='concat', + time_major=False, + cell_cls=None, + **kwargs): + super(BidirectionalRNN, self).__init__() + self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) + self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) + if merge_mode == 'concat': + self.merge_func = lambda x, y: layers.concat([x, y], -1) + elif merge_mode == 'sum': + self.merge_func = lambda x, y: layers.elementwise_add(x, y) + elif merge_mode == 'ave': + self.merge_func = lambda x, y: layers.scale( + layers.elementwise_add(x, y), 0.5) + elif merge_mode == 'mul': + self.merge_func = lambda x, y: layers.elementwise_mul(x, y) + elif merge_mode == 'zip': + self.merge_func = lambda x, y: (x, y) + elif merge_mode is None: + self.merge_func = None + else: + raise ValueError('Unsupported value for `merge_mode`: %s' % + merge_mode) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. + Performs forward and backward RNN separately, and merge outputs of these + two RNN according to `merge_mode`. + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (Variable|list|tuple): If it is a list or tuple, its + length should be 2 to include initial states of forward and backward + RNN separately. Otherwise it would be used twice for the two RNN. + If None, `cell.get_initial_states` would be used to produce the initial + states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. Returns: - list: A list composed of each including LSTM cell's `state_shape`. + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is produced by merge outputs of forward and backward RNN according \ + to `merge_mode`; similarly, `final_states` is produced by merge \ + `final_states` of forward and backward RNN. 
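# What the merge_mode options wired up in BidirectionalRNN.__init__ above do
# to the per-direction outputs, shown on plain NumPy arrays (illustration only).
import numpy as np

fw = np.ones((2, 4, 8), "float32")   # forward-direction outputs
bw = np.ones((2, 4, 8), "float32")   # backward-direction outputs

concat = np.concatenate([fw, bw], axis=-1)   # 'concat' -> [2, 4, 16]
summed = fw + bw                             # 'sum'    -> [2, 4, 8]
avg    = 0.5 * (fw + bw)                     # 'ave'    -> [2, 4, 8]
prod   = fw * bw                             # 'mul'    -> [2, 4, 8]
zipped = (fw, bw)                            # 'zip' / None -> keep both tensors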
""" - return [cell.state_shape for cell in self.cells] + if isinstance(initial_states, (list, tuple)): + assert len( + initial_states + ) == 2, "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], + sequence_length, **kwargs) + outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], + sequence_length, **kwargs) + outputs = map_structure(self.merge_func, outputs_fw, + outputs_bw) if self.merge_func else (outputs_fw, + outputs_bw) + final_states = map_structure( + self.merge_func, states_fw, + states_bw) if self.merge_func else (states_fw, states_bw) + return outputs, final_states + @staticmethod + def bidirect_param_attr(param_attr): + """ + Converts `param_attr` to a pair of `param_attr` when it is not a list + or tuple with length 2, also rename every one by appending a suffix to + avoid having same names when `param_attr` contains a name. -class StackedGRUCell(RNNCellBase): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. When + it is a list or tuple, its length must be 2. - The formula for GRU used here is as follows: + Returns: + list: A pair composed of forward and backward RNN cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len( + param_attr + ) == 2, "length of param_attr should be 2 when it is a list/tuple" + param_attrs = param_attr + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + attr_fw = copy.deepcopy(attr) + if attr.name: + attr_fw.name = attr_fw.name + "_fw" + param_attrs.append(attr_fw) + attr_bw = copy.deepcopy(attr) + if attr.name: + attr_bw.name = attr_bw.name + "_bw" + param_attrs.append(attr_bw) + return param_attrs - .. math:: - u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) +class SimpleRNN(Layer): + pass - r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) - \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + The formula for LSTM used here is as follows: + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - num_layers(int, optional): The number of GRU to be stacked. Default 1. + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. dropout(float, optional): The dropout probability applied on the outputs - of each GRU cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. 
If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, + of each LSTM except the last one. 0 for not dropout. Default 0.0 + direction (str, optional): Indicate the direction for LSTM calculation + applying on the input sequences. It can be `forward`, `backward` or + `bidirect`. If it is `backward`, calculate in the reverse order of + input sequences. If it is `bidirect`, each layer would be a + bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, + and it concatenates their outputs as outputs. Default: `forward`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. Default None. bias_attr (list|tuple|ParamAttr): A list, tuple or something can be @@ -675,413 +757,339 @@ class StackedGRUCell(RNNCellBase): Default None. dtype(string, optional): The data type used in this cell. It can be float32 or float64. Default float32. - Examples: - .. code-block:: python - import paddle - + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] """ def __init__(self, input_size, hidden_size, num_layers=1, + direction="forward", dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.hidden_size = hidden_size + time_major=False, + name=None): + super(LSTM, self).__init__() self.input_size = input_size + self.hidden_size = hidden_size self.num_layers = num_layers self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + self.direction = direction + self.num_directions = 2 if direction == 'bidirect' else 1 + self.time_major = time_major - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) + if direction == 'bidirect': + param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) + bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) + fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], + num_layers) + bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], + num_layers) + fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], + num_layers) + bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], + num_layers) - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. 
+ # maybe design cell including both forward and backward later + merge_mode = 'concat' + rnns = [] + for i in range(num_layers): + cell_fw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], + fw_bias_attrs[i], dtype) + cell_bw = StackedLSTMCell(input_size if i == 0 else ( + hidden_size * 2 if merge_mode == 'concat' else + hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], + bw_bias_attrs[i], dtype) + rnns.append( + BidirectionalRNN( + cell_fw, + cell_bw, + merge_mode=merge_mode, + time_major=time_major)) + self.lstm = LayerList(rnns) + else: + lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, + is_reverse=(direction == "backward"), + time_major=time_major) + def forward(self, input, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. 
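# Shape bookkeeping behind the `bidirect` branch of LSTM.__init__ above: with
# merge_mode 'concat' every layer emits [batch, seq, 2 * hidden_size], so each
# layer after the first receives 2 * hidden_size input features. Pure NumPy,
# shapes only; the numbers are an assumed example.
import numpy as np

batch, seq_len, input_size, hidden_size = 2, 4, 32, 64
x = np.zeros((batch, seq_len, input_size), "float32")
for layer in range(3):                       # e.g. num_layers = 3
    in_width = x.shape[-1]                   # 32 for layer 0, then 128
    fw = np.zeros((batch, seq_len, hidden_size), "float32")
    bw = np.zeros((batch, seq_len, hidden_size), "float32")
    x = np.concatenate([fw, bw], axis=-1)    # 'concat' merge -> [2, 4, 128]
print(x.shape)  # (2, 4, 128)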
""" - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states + if not isinstance(self.lstm, LayerList): + return self.lstm(input, initial_states, sequence_length) + else: + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == self.num_layers, ( + "length of initial_states should be %d when it is a list|tuple" + % self.num_layers) + else: + initial_states = [initial_states] * self.num_layers + stacked_states = [] + for i in range(self.num_layers): + output, states = self.lstm[i](input, initial_states[i], + sequence_length) + input = output + stacked_states.append(states) + return output, stacked_states - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] +class GRU(Layer): + pass -class RNN(Layer): +# TODO: restucture RNN layers +class StackedRNNCell(RNNCellBase): """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. Parameters: - cell(RNNCell): An instance of `RNNCell`. - is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + cells (list|tuple): List of RNN cell instances. Examples: .. code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + from paddle import LSTMCell, StackedRNNCell + cells = [LSTMCell(32, 32), LSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) """ - def __init__(self, cell, is_reverse=False, time_major=False): - super(RNN, self).__init__() - self.cell = cell - if not hasattr(self.cell, "call"): - # for non-dygraph mode, `rnn` api uses cell.call - self.cell.call = self.cell.forward - self.is_reverse = is_reverse - self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) if time_major else (0, - 1) + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = LayerList(cells) - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): + def forward(self, inputs, states): """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. 
- The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. 
""" - flat_inputs = flatten(inputs) - batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) - - if initial_states is None: - initial_states = self.cell.get_initial_states( - batch_ref=inputs, - dtype=self.cell.dtype if hasattr(self.cell, "dtype") else - self.cell.parameters()[0].dtype, - batch_dim_idx=self.batch_index) - - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = layers.elementwise_mul( - new_state, step_mask, axis=0) - layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - if not self.time_major: - inputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = layers.sequence_mask( - sequence_length, - maxlen=time_steps, - dtype=flatten(initial_states)[0].dtype) - mask = layers.transpose(mask, [1, 0]) + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state) + inputs = outputs + new_states.append(new_state) + return outputs, new_states - if self.is_reverse: - inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), - inputs) - mask = layers.reverse( - mask, axis=[0]) if sequence_length is not None else None + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states, - **kwargs) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + Returns: + list: A list composed of each including cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] - final_outputs = map_structure( - lambda x: layers.stack(x.array, axis=self.time_step_index), - outputs) - if self.is_reverse: - final_outputs = map_structure( - lambda x: layers.reverse(x, axis=self.time_step_index), - final_outputs) +class StackedLSTMCell(RNNCellBase): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. - final_states = new_states - else: - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse, - **kwargs) - return final_outputs, final_states + The formula for LSTM used here is as follows: + .. math:: + i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) + f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) + o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) + c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) + h_{t} & = o_{t} \\tanh (c_{t}) -class BidirectionalRNN(Layer): - """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float, optional): The dropout probability applied on the outputs + of each LSTM cell except the last one. 0 for no dropout. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + Examples: .. 
code-block:: python import paddle - from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] + cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) + input_size, + hidden_size, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.hidden_size = hidden_size + self.input_size = input_size + self.num_layers = num_layers + self.dropout = dropout + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - def forward(self, - inputs, - initial_states=None, - sequence_length=None, - **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`; similarly, `final_states` is produced by merge \ - `final_states` of forward and backward RNN. 
- """ - if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" - else: - initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - final_states = map_structure( - self.merge_func, states_fw, - states_bw) if self.merge_func else (states_fw, states_bw) - return outputs, final_states + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + LSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) - @staticmethod - def bidirect_param_attr(param_attr): + def forward(self, inputs, states): """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + # TODO(guosheng): maybe should stack list of states as one tensor + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + Returns: + list: A list composed of each including LSTM cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] -class LSTM(Layer): +class StackedGRUCell(RNNCellBase): """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. + Wrapper allowing a stack of GRU cells to behave as a single cell. It is used + to implement stacked GRU. - The formula for LSTM used here is as follows: + The formula for GRU used here is as follows: .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + + u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) + + r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) + + \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + num_layers(int, optional): The number of GRU to be stacked. Default 1. dropout(float, optional): The dropout probability applied on the outputs - of each LSTM except the last one. 0 for not dropout. Default 0.0 - direction (str, optional): Indicate the direction for LSTM calculation - applying on the input sequences. It can be `forward`, `backward` or - `bidirect`. If it is `backward`, calculate in the reverse order of - input sequences. If it is `bidirect`, each layer would be a - bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, - and it concatenates their outputs as outputs. Default: `forward`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + of each GRU cell except the last one. 0 for no dropout. Default 0.0 param_attr (list|tuple|ParamAttr): A list, tuple or something can be converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is a list or tuple, it's length must equal to `num_layers`. Otherwise, @@ -1094,14 +1102,17 @@ class LSTM(Layer): Default None. dtype(string, optional): The data type used in this cell. It can be float32 or float64. Default float32. + Examples: + .. 
code-block:: python + import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM + inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] + cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) + rnn = paddle.RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, @@ -1109,95 +1120,67 @@ def __init__(self, hidden_size, num_layers=1, dropout=0.0, - direction="forward", - time_major=False, param_attr=None, bias_attr=None, - dtype='float32'): - super(LSTM, self).__init__() - self.input_size = input_size + dtype="float32"): + super(StackedGRUCell, self).__init__() self.hidden_size = hidden_size + self.input_size = input_size self.num_layers = num_layers self.dropout = dropout - self.direction = direction - self.num_directions = 2 if direction == 'bidirect' else 1 - self.time_major = time_major - - if direction == 'bidirect': - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - # maybe design cell including both forward and backward later - merge_mode = 'concat' - rnns = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - rnns.append( - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major)) - self.lstm = LayerList(rnns) - else: - lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, - is_reverse=(direction == "backward"), - time_major=time_major) + self.cells = [] + for i in range(num_layers): + self.cells.append( + self.add_sublayer( + "gru_%d" % i, + GRUCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) - def forward(self, input, initial_states=None, sequence_length=None): + def forward(self, inputs, states): """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. + Performs the stacked GRU cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. 
- sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last GRU; `new_states` \ + is a list composed of every GRU `new_states` which is also \ + :math:`h_{t}` in the formula, and the data type and structure \ + of these tensors all is same as that of `states`. """ - if not isinstance(self.lstm, LayerList): - return self.lstm(input, initial_states, sequence_length) - else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list|tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - output, states = self.lstm[i](input, initial_states[i], - sequence_length) - input = output - stacked_states.append(states) - return output, stacked_states + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout, + dropout_implementation='upscale_in_train' + ) if self.dropout and i != (self.num_layers - 1) else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedGRUCell is a list composed of each including + GRU cell's `state_shape`. + + Returns: + list: A list composed of each including GRU cell's `state_shape`. 
+ """ + return [cell.state_shape for cell in self.cells] From a88fb8887396d45175fa6da55e9800af7f739632 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 18 Aug 2020 17:56:45 +0800 Subject: [PATCH 03/14] add new progresses in rnn APIs for 2.0 --- python/paddle/fluid/layers/rnn.py | 99 ++++- python/paddle/nn/layer/rnn.py | 658 ++++++++++++++++++------------ 2 files changed, 485 insertions(+), 272 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index ecc5876852283..6260c5684488a 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -492,6 +492,95 @@ def rnn(cell, cell = fluid.layers.GRUCell(hidden_size=128) outputs = fluid.layers.rnn(cell=cell, inputs=inputs) """ + if in_dygraph_mode: + return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length, + time_major, is_reverse, **kwargs) + else: + return _rnn_static_graph(cell, inputs, initial_states, sequence_length, + time_major, is_reverse, **kwargs) + + +class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + +def _maybe_copy(state, new_state, step_mask): + """update rnn state or just pass the old state through""" + new_state = nn.elementwise_mul(new_state, step_mask, axis=0) \ + + nn.elementwise_mul(state, (1 - step_mask), axis=0) + return new_state + + +def _transpose_batch_time(x): + perm = [1, 0] + list(range(2, len(x.shape))) + return nn.transpose(x, perm) + + +def _rnn_dynamic_graph(cell, + inputs, + initial_states=None, + sequence_length=None, + time_major=False, + is_reverse=False, + **kwargs): + + time_step_index = 0 if time_major else 1 + flat_inputs = flatten(inputs) + time_steps = flat_inputs[0].shape[time_step_index] + + if not time_major: + inputs = map_structure(_transpose_batch_time, inputs) + + if sequence_length is not None: + mask = sequence_lod.sequence_mask( + sequence_length, maxlen=time_steps, dtype=inputs.dtype) + mask = nn.transpose(mask, [1, 0]) + + if is_reverse: + inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs) + mask = tensor.reverse(mask, axis=[0]) \ + if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = cell(step_inputs, states, **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), states, new_states) + states = new_states + outputs = map_structure(lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), + step_outputs, outputs) + + final_outputs = map_structure( + lambda x: nn.stack(x.array, axis=time_step_index), + outputs) + + if is_reverse: + final_outputs = map_structure( + lambda x: tensor.reverse(x, axis=time_step_index), + final_outputs) + + final_states = new_states + return final_outputs, final_states + + +def _rnn_static_graph(cell, + inputs, + initial_states=None, + sequence_length=None, + time_major=False, + is_reverse=False, + **kwargs): check_type(inputs, 'inputs', (Variable, list, tuple), 'rnn') if isinstance(inputs, (list, tuple)): for i, input_x in enumerate(inputs): @@ -513,16 +602,6 @@ def rnn(cell, check_type(sequence_length, 'sequence_length', (Variable, type(None)), 'rnn') - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = nn.elementwise_mul( - new_state, step_mask, axis=0) - nn.elementwise_mul( - 
state, (step_mask - 1), axis=0) - return new_state - - def _transpose_batch_time(x): - return nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))) - def _switch_grad(x, stop=False): x.stop_gradient = stop return x diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b64f04b37879c..e93a34f59a09c 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -37,16 +37,66 @@ 'SimpleRNNCell', 'LSTMCell', 'GRUCell', - 'StackedRNNCell', - 'StackedLSTMCell', - # 'stackedGRUCell', 'RNN', - 'BidirectionalRNN', + 'BiRNN', + 'SimpleRNN', 'LSTM', - # 'GRU', + 'GRU', + # 'StackedRNNCell', + # 'StackedLSTMCell', + # 'stackedGRUCell', ] +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = layers.unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + states = tuple([layers.unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return layers.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [layers.stack(item) for item in componnets] + + +def birnn(cell_fw, cell_bw, inputs, states_fw, states_bw, sequence_length, + time_major): + outputs_fw, states_fw = layers.rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = layers.rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = map_structure(lambda x, y: layers.concat([x, y], -1), outputs_fw, + outputs_bw) + + final_states = (states_fw, states_bw) + return outputs, final_states + + class RNNCellBase(Layer): """ RNNCellBase is the base class for abstraction representing the calculations @@ -460,8 +510,8 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.cell.call = self.cell.forward self.is_reverse = is_reverse self.time_major = time_major - self.batch_index, self.time_step_index = (1, 0) \ - if time_major else (0, 1) + # self.batch_index, self.time_step_index = (1, 0) \ + # if time_major else (0, 1) def forward(self, inputs, initial_states=None, sequence_length=None): """ @@ -494,9 +544,6 @@ def forward(self, inputs, initial_states=None, sequence_length=None): thus has the same structure with it and has tensors with same shapes \ and data types. 
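# The state layout assumed by split_states / concat_states introduced above,
# shown with NumPy for a single-component state (SimpleRNN/GRU hidden state):
# states are stacked as [num_layers * num_directions, batch, hidden];
# split_states yields one entry per layer, pairing forward/backward slices
# when bidirectional, and concat_states stacks them back. LSTM states use
# state_components=2, giving one stacked tensor per component (h and c).
import numpy as np

num_layers, batch, hidden = 2, 4, 8
stacked = np.zeros((num_layers * 2, batch, hidden), "float32")   # bidirectional

per_layer = list(zip(stacked[0::2], stacked[1::2]))              # [(fw, bw), (fw, bw)]
assert len(per_layer) == num_layers
assert per_layer[0][0].shape == (batch, hidden)

restacked = np.stack([s for pair in per_layer for s in pair])    # back to [4, 4, 8]
assert restacked.shape == stacked.shape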
""" - flat_inputs = flatten(inputs) - batch_size, time_steps = (flat_inputs[0].shape[self.batch_index], - flat_inputs[0].shape[self.time_step_index]) if initial_states is None: initial_states = self.cell.get_initial_states( @@ -504,79 +551,17 @@ def forward(self, inputs, initial_states=None, sequence_length=None): dtype=inputs.dtype, batch_dim_idx=self.batch_index) - if fluid.in_dygraph_mode(): - - class ArrayWrapper(object): - def __init__(self, x): - self.array = [x] - - def append(self, x): - self.array.append(x) - return self - - def _maybe_copy(state, new_state, step_mask): - # TODO: use where_op - new_state = layers.elementwise_mul( - new_state, step_mask, axis=0) - layers.elementwise_mul( - state, (step_mask - 1), axis=0) - return new_state - - if not self.time_major: - inputs = map_structure( - lambda x: layers.transpose(x, [1, 0] + list( - range(2, len(x.shape)))), inputs) - - if sequence_length is not None: - mask = layers.sequence_mask( - sequence_length, maxlen=time_steps, dtype=inputs.dtype) - mask = layers.transpose(mask, [1, 0]) - - if self.is_reverse: - inputs = map_structure(lambda x: layers.reverse(x, axis=[0]), - inputs) - mask = layers.reverse( - mask, axis=[0]) if sequence_length is not None else None - - states = initial_states - outputs = [] - for i in range(time_steps): - step_inputs = map_structure(lambda x: x[i], inputs) - step_outputs, new_states = self.cell(step_inputs, states) - if sequence_length is not None: - new_states = map_structure( - partial( - _maybe_copy, step_mask=mask[i]), - states, - new_states) - states = new_states - outputs = map_structure( - lambda x: ArrayWrapper(x), - step_outputs) if i == 0 else map_structure( - lambda x, x_array: x_array.append(x), step_outputs, - outputs) - - final_outputs = map_structure( - lambda x: layers.stack(x.array, axis=self.time_step_index), - outputs) - - if self.is_reverse: - final_outputs = map_structure( - lambda x: layers.reverse(x, axis=self.time_step_index), - final_outputs) - - final_states = new_states - else: - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse) + final_outputs, final_states = layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) return final_outputs, final_states -class BidirectionalRNN(Layer): +class BiRNN(Layer): """ Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform forward and backward RNN separately, and merge outputs of these two RNN @@ -584,11 +569,7 @@ class BidirectionalRNN(Layer): Parameters: cell_fw (RNNCell): A RNNCell instance used for forward RNN. cell_bw (RNNCell): A RNNCell instance used for backward RNN. - merge_mode (str|None, optional): The way to merget outputs of forward and - backward RNN. It can be `concat`, `sum`, `ave`, `mul`, `zip` and None, - where None stands for make the two `outputs` as a tuple, `zip` stands - for make each two corresponding tensors of the two `outputs` as a tuple. - Default `concat` + Examples: .. 
code-block:: python import paddle @@ -600,32 +581,11 @@ class BidirectionalRNN(Layer): outputs, _ = bi_rnn(inputs) # [2, 4, 128] """ - def __init__(self, - cell_fw, - cell_bw, - merge_mode='concat', - time_major=False, - cell_cls=None, - **kwargs): - super(BidirectionalRNN, self).__init__() - self.rnn_fw = RNN(cell_fw, is_reverse=False, time_major=time_major) - self.rnn_bw = RNN(cell_bw, is_reverse=True, time_major=time_major) - if merge_mode == 'concat': - self.merge_func = lambda x, y: layers.concat([x, y], -1) - elif merge_mode == 'sum': - self.merge_func = lambda x, y: layers.elementwise_add(x, y) - elif merge_mode == 'ave': - self.merge_func = lambda x, y: layers.scale( - layers.elementwise_add(x, y), 0.5) - elif merge_mode == 'mul': - self.merge_func = lambda x, y: layers.elementwise_mul(x, y) - elif merge_mode == 'zip': - self.merge_func = lambda x, y: (x, y) - elif merge_mode is None: - self.merge_func = None - else: - raise ValueError('Unsupported value for `merge_mode`: %s' % - merge_mode) + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major def forward(self, inputs, @@ -659,21 +619,15 @@ def forward(self, `final_states` of forward and backward RNN. """ if isinstance(initial_states, (list, tuple)): - assert len( - initial_states - ) == 2, "length of initial_states should be 2 when it is a list/tuple" + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" else: initial_states = [initial_states, initial_states] - outputs_fw, states_fw = self.rnn_fw(inputs, initial_states[0], - sequence_length, **kwargs) - outputs_bw, states_bw = self.rnn_bw(inputs, initial_states[1], - sequence_length, **kwargs) - outputs = map_structure(self.merge_func, outputs_fw, - outputs_bw) if self.merge_func else (outputs_fw, - outputs_bw) - final_states = map_structure( - self.merge_func, states_fw, - states_bw) if self.merge_func else (states_fw, states_bw) + states_fw, states_bw = initial_states + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + states_fw, states_bw, sequence_length, + self.time_major) return outputs, final_states @staticmethod @@ -710,162 +664,342 @@ def bidirect_param_attr(param_attr): return param_attrs -class SimpleRNN(Layer): - pass +class SimpleRNN(LayerList): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + nonlinearity="tanh", + direction="forward", + dropout=0., + time_major=False, + name=None): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + self.dropout = dropout + 
self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers -class LSTM(Layer): - """ - Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input - sequence. + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + initial_states = layers.zeros(state_shape, dtype=inputs.dtype) + + states = split_states(initial_states, self.num_directions == 2) + final_states = [] + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs - The formula for LSTM used here is as follows: + final_states = concat_states(final_states, self.num_directions == 2) + return outputs, final_states - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) - - Parameters: - input_size (int): The input feature size for the first LSTM. - hidden_size (int): The hidden size for every LSTM. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM except the last one. 0 for not dropout. Default 0.0 - direction (str, optional): Indicate the direction for LSTM calculation - applying on the input sequences. It can be `forward`, `backward` or - `bidirect`. If it is `backward`, calculate in the reverse order of - input sequences. If it is `bidirect`, each layer would be a - bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, - and it concatenates their outputs as outputs. Default: `forward`. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - Examples: - .. 
code-block:: python - import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM - inputs = paddle.rand((2, 4, 32)) - lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) - outputs, _ = lstm(inputs) # [2, 4, 64] - """ +class LSTM(LayerList): def __init__(self, input_size, hidden_size, num_layers=1, direction="forward", - dropout=0.0, + dropout=0., time_major=False, name=None): super(LSTM, self).__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.num_layers = num_layers + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + self.dropout = dropout - self.direction = direction - self.num_directions = 2 if direction == 'bidirect' else 1 + self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major + self.num_layers = num_layers - if direction == 'bidirect': - param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) - bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) - fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], - num_layers) - bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], - num_layers) - fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], - num_layers) - bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], - num_layers) - - # maybe design cell including both forward and backward later - merge_mode = 'concat' - rnns = [] - for i in range(num_layers): - cell_fw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], - fw_bias_attrs[i], dtype) - cell_bw = StackedLSTMCell(input_size if i == 0 else ( - hidden_size * 2 if merge_mode == 'concat' else - hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], - bw_bias_attrs[i], dtype) - rnns.append( - BidirectionalRNN( - cell_fw, - cell_bw, - merge_mode=merge_mode, - time_major=time_major)) - self.lstm = LayerList(rnns) - else: - lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, - dropout, param_attr, bias_attr, dtype) - self.lstm = RNN(lstm_cell, - is_reverse=(direction == "backward"), - time_major=time_major) + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + init_h = layers.zeros(state_shape, dtype=inputs.dtype) + init_c = layers.zeros(state_shape, dtype=inputs.dtype) + initial_states = (init_h, init_c) + + states = split_states(initial_states, self.num_directions == 2, 2) + final_states = [] + for i, rnn_layer in enumerate(self): + if i 
> 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs - def forward(self, input, initial_states=None, sequence_length=None): - """ - Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` - is the `inputs` of the subsequent one. - Parameters: - inputs (Variable): The inputs for the first LSTM. It is a float32 - or float64 tensor shaped `[batch_size, sequence_length, input_size]`. - initial_states (list|None, optional): A list containing initial states - of all stacked LSTM, and the initial states of each LSTM is a pair - of tensors shaped `[batch_size, hidden_size]`. If not provided, - use 0 as initial states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is the output of last LSTM and it is a tensor with shape \ - `[batch_size, sequence_length, hidden_size]` and has the same \ - data type as `inputs`, `final_states` is the counterpart of \ - `initial_states` at last time step, thus has the same structure \ - with it and has tensors with same shapes data types. - """ - if not isinstance(self.lstm, LayerList): - return self.lstm(input, initial_states, sequence_length) + final_states = concat_states(final_states, self.num_directions == 2, 2) + return outputs, final_states + + +class GRU(LayerList): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False, + name=None): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size) + cell_bw = GRUCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size) + cell_bw = GRUCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) else: - if isinstance(initial_states, (list, tuple)): - assert len(initial_states) == self.num_layers, ( - "length of initial_states should be %d when it is a list|tuple" - % self.num_layers) - else: - initial_states = [initial_states] * self.num_layers - stacked_states = [] - for i in range(self.num_layers): - output, states = self.lstm[i](input, initial_states[i], - sequence_length) - input = output - stacked_states.append(states) - return output, stacked_states - - -class GRU(Layer): - pass + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = 
inputs.shape[batch_index] if fluid.in_dygraph_mode() \ + else layers.shape(inputs)[batch_index] + if initial_states is None: + state_shape = (self.num_directions * self.num_layers, batch_size, + self.hidden_size) + initial_states = layers.zeros(state_shape, dtype=inputs.dtype) + states = split_states(initial_states, self.num_directions == 2) + + final_states = [] + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = layers.dropout( + inputs, + self.dropout, + dropout_implementation="upscale_in_train") + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2) + return outputs, final_states + + +# class LSTM(Layer): +# """ +# Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input +# sequence. + +# The formula for LSTM used here is as follows: + +# .. math:: +# i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) +# f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) +# o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) +# c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) +# h_{t} & = o_{t} \\tanh (c_{t}) + +# Parameters: +# input_size (int): The input feature size for the first LSTM. +# hidden_size (int): The hidden size for every LSTM. +# num_layers(int, optional): The number of LSTM to be stacked. Default 1. +# dropout(float, optional): The dropout probability applied on the outputs +# of each LSTM except the last one. 0 for not dropout. Default 0.0 +# direction (str, optional): Indicate the direction for LSTM calculation +# applying on the input sequences. It can be `forward`, `backward` or +# `bidirect`. If it is `backward`, calculate in the reverse order of +# input sequences. If it is `bidirect`, each layer would be a +# bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, +# and it concatenates their outputs as outputs. Default: `forward`. +# time_major (bool, optional): Indicate the data layout of Tensor included +# in `input` and `output` tensors. If `False`, the data layout would +# be batch major with shape `[batch_size, sequence_length, ...]`. If +# `True`, the data layout would be time major with shape +# `[sequence_length, batch_size, ...]`. Default: `False`. +# param_attr (list|tuple|ParamAttr): A list, tuple or something can be +# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is +# a list or tuple, it's length must equal to `num_layers`. Otherwise, +# construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. +# Default None. +# bias_attr (list|tuple|ParamAttr): A list, tuple or something can be +# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is +# a list or tuple, it's length must equal to `num_layers`. Otherwise, +# construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. +# Default None. +# dtype(string, optional): The data type used in this cell. It can be +# float32 or float64. Default float32. +# Examples: +# .. 
code-block:: python +# import paddle +# import paddle.fluid as fluid +# from paddle.incubate.hapi.text import LSTM +# inputs = paddle.rand((2, 4, 32)) +# lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) +# outputs, _ = lstm(inputs) # [2, 4, 64] +# """ + +# def __init__(self, +# input_size, +# hidden_size, +# num_layers=1, +# direction="forward", +# dropout=0.0, +# time_major=False, +# name=None): +# super(LSTM, self).__init__() +# self.input_size = input_size +# self.hidden_size = hidden_size +# self.num_layers = num_layers +# self.dropout = dropout +# self.direction = direction +# self.num_directions = 2 if direction == 'bidirect' else 1 +# self.time_major = time_major + +# if direction == 'bidirect': +# param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) +# bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) +# fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], +# num_layers) +# bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], +# num_layers) +# fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], +# num_layers) +# bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], +# num_layers) + +# # maybe design cell including both forward and backward later +# merge_mode = 'concat' +# rnns = [] +# for i in range(num_layers): +# cell_fw = StackedLSTMCell(input_size if i == 0 else ( +# hidden_size * 2 if merge_mode == 'concat' else +# hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], +# fw_bias_attrs[i], dtype) +# cell_bw = StackedLSTMCell(input_size if i == 0 else ( +# hidden_size * 2 if merge_mode == 'concat' else +# hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], +# bw_bias_attrs[i], dtype) +# rnns.append( +# BidirectionalRNN( +# cell_fw, +# cell_bw, +# merge_mode=merge_mode, +# time_major=time_major)) +# self.lstm = LayerList(rnns) +# else: +# lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, +# dropout, param_attr, bias_attr, dtype) +# self.lstm = RNN(lstm_cell, +# is_reverse=(direction == "backward"), +# time_major=time_major) + +# def forward(self, input, initial_states=None, sequence_length=None): +# """ +# Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` +# is the `inputs` of the subsequent one. +# Parameters: +# inputs (Variable): The inputs for the first LSTM. It is a float32 +# or float64 tensor shaped `[batch_size, sequence_length, input_size]`. +# initial_states (list|None, optional): A list containing initial states +# of all stacked LSTM, and the initial states of each LSTM is a pair +# of tensors shaped `[batch_size, hidden_size]`. If not provided, +# use 0 as initial states. Default None. +# sequence_length (Variable, optional): A tensor with shape `[batch_size]`. +# It stores real length of each instance, thus enables users to extract +# the last valid state when past a batch element's sequence length for +# correctness. If not provided, the paddings would be treated same as +# non-padding inputs. Default None. +# Returns: +# tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ +# is the output of last LSTM and it is a tensor with shape \ +# `[batch_size, sequence_length, hidden_size]` and has the same \ +# data type as `inputs`, `final_states` is the counterpart of \ +# `initial_states` at last time step, thus has the same structure \ +# with it and has tensors with same shapes data types. 
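#
#     A minimal usage sketch for the multi-layer LSTM defined above, assuming
#     `paddle.nn.LSTM` is the exported name; sizes are arbitrary and kept as
#     comments to match the surrounding block:
#
#         import paddle
#         paddle.disable_static()
#
#         lstm = paddle.nn.LSTM(input_size=16, hidden_size=32, num_layers=2)
#         x = paddle.rand((4, 23, 16))    # [batch_size, time_steps, input_size]
#         y, (h, c) = lstm(x)             # y: [4, 23, 32]; h, c: [2, 4, 32]
#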
+# """ +# if not isinstance(self.lstm, LayerList): +# return self.lstm(input, initial_states, sequence_length) +# else: +# if isinstance(initial_states, (list, tuple)): +# assert len(initial_states) == self.num_layers, ( +# "length of initial_states should be %d when it is a list|tuple" +# % self.num_layers) +# else: +# initial_states = [initial_states] * self.num_layers +# stacked_states = [] +# for i in range(self.num_layers): +# output, states = self.lstm[i](input, initial_states[i], +# sequence_length) +# input = output +# stacked_states.append(states) +# return output, stacked_states # TODO: restucture RNN layers From 5fb65ba119f42f20e6af9fdc9fcefbc848759a30 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 20 Aug 2020 20:12:13 +0800 Subject: [PATCH 04/14] refine rnn APIs and docstrings. --- python/paddle/fluid/layers/rnn.py | 143 ++- python/paddle/nn/functional/__init__.py | 5 +- python/paddle/nn/functional/rnn.py | 8 +- python/paddle/nn/layer/rnn.py | 1515 +++++++++++------------ 4 files changed, 826 insertions(+), 845 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index e922cd48267c8..ae6539370f25f 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -438,24 +438,22 @@ def rnn(cell, is_reverse=False, **kwargs): """ - :api_attr: Static Graph - rnn creates a recurrent neural network specified by RNNCell `cell`, - which performs :code:`cell.call()` repeatedly until reaches to the maximum - length of `inputs`. + which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) + repeatedly until reaches to the maximum length of `inputs`. - Parameters: - cell(RNNCell): An instance of `RNNCell`. - inputs(Variable): A (possibly nested structure of) tensor variable[s]. + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): A (possibly nested structure of) tensor[s]. The shape of tensor should be `[batch_size, sequence_length, ...]` for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. It represents the inputs to be unrolled in RNN. - initial_states(Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. + initial_states(Tensor, optional): A (possibly nested structure of) + tensor[s], representing the initial state for RNN. If not provided, `cell.get_initial_states` would be used to produce the initial state. Default None. - sequence_length(Variable, optional): A tensor with shape `[batch_size]`. + sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. It stores real length of each instance, thus enables users to extract the last valid state when past a batch element's sequence length for correctness. If not provided, the paddings would be treated same as @@ -470,30 +468,33 @@ def rnn(cell, **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.call` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. 
\ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequence. Tensor or nested + structure of Tensor. + If `time_major` is True, the shape of each tensor in outpus is + `[time_steps, batch_size, hidden_size]`, else + `[batch_size, time_steps, hidden_size]`. + final_states (Tensor|list|tuple): final states. A (possibly nested structure of) + tensor[s], representing the final state for RNN. It has the same + structure of intial state. Each tensor in final states has the same + shape and dtype as the corresponding tensor in initial states. Examples: .. code-block:: python - - import paddle.fluid as fluid - inputs = fluid.data(name="inputs", - shape=[-1, 32, 128], - dtype="float32") - cell = fluid.layers.GRUCell(hidden_size=128) - outputs = fluid.layers.rnn(cell=cell, inputs=inputs) + import paddle + paddle.disable_static() + + cell = paddle.nn.SimpleRNNCell(16, 32) + + inputs = paddle.rand((4, 23, 16)) + prev_h = paddle.randn((4, 32)) + outputs, final_states = paddle.nn.functional.rnn(cell, inputs, prev_h) + """ - if in_dygraph_mode: + if in_dygraph_mode(): return _rnn_dynamic_graph(cell, inputs, initial_states, sequence_length, time_major, is_reverse, **kwargs) else: @@ -529,7 +530,6 @@ def _rnn_dynamic_graph(cell, time_major=False, is_reverse=False, **kwargs): - time_step_index = 0 if time_major else 1 flat_inputs = flatten(inputs) time_steps = flat_inputs[0].shape[time_step_index] @@ -589,16 +589,6 @@ def _rnn_static_graph(cell, ['float32', 'float64'], 'rnn') check_type(initial_states, 'initial_states', (Variable, list, tuple, type(None)), 'rnn') - if isinstance(initial_states, (list, tuple)): - states = map_structure(lambda x: x, initial_states)[0] - for i, state in enumerate(states): - if isinstance(state, (list, tuple)): - for j, state_j in enumerate(state): - check_variable_and_dtype(state_j, 'state_j[' + str(j) + ']', - ['float32', 'float64'], 'rnn') - else: - check_variable_and_dtype(state, 'states[' + str(i) + ']', - ['float32', 'float64'], 'rnn') check_type(sequence_length, 'sequence_length', (Variable, type(None)), 'rnn') @@ -661,6 +651,83 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) +def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, + time_major): + """ + birnn creates a bidirectional recurrent neural network specified by + RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` + (for dygraph mode :code:`cell.forward`) repeatedly until reaches to + the maximum length of `inputs` and then concat the ouputs for both RNNs + along the last axis. + + Arguments: + cell(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): A (possibly nested structure of) tensor[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states(tuple, optional): A tuple of + If not provided, `cell.get_initial_states` would be used to produce + the each initial state. Defaults to None. + sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. 
Default None. + time_major(bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, time_steps, ...]`. If + `True`, the data layout would be time major with shape + `[time_steps, batch_size, ...]`. Default: `False`. + **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + + Returns: + outputs (Tensor): A (possibly nested structure of) tensor variable[s], + the outputs of the bidirectional RNN. It is the concatenation + of the outputs for both the forward RNN and backward RNN along + the last axis. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. + + + Examples: + + .. code-block:: python + + import paddle + paddle.disable_static() + + cell_fw = LSTMCell(16, 32) + cell_bw = LSTMCell(16, 32) + inputs = paddle.rand((2, 23, 16)) + outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs) + + """ + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw, + outputs_bw) + + final_states = (states_fw, states_bw) + return outputs, final_states + + class Decoder(object): """ :api_attr: Static Graph diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index bc71b8bdf06d2..81ab6c62f0915 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -156,9 +156,8 @@ from .pooling import pool3d #DEFINE_ALIAS from .pooling import adaptive_pool2d #DEFINE_ALIAS from .pooling import adaptive_pool3d #DEFINE_ALIAS -# from .rnn import gru_unit #DEFINE_ALIAS -# from .rnn import lstm #DEFINE_ALIAS -# from .rnn import lstm_unit #DEFINE_ALIAS +from .rnn import rnn #DEFINE_ALIAS +from .rnn import birnn #DEFINE_ALIAS from .vision import affine_channel #DEFINE_ALIAS from .vision import affine_grid #DEFINE_ALIAS from .vision import anchor_generator #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/rnn.py b/python/paddle/nn/functional/rnn.py index 520cf44360dc3..b7a97bc5aa303 100644 --- a/python/paddle/nn/functional/rnn.py +++ b/python/paddle/nn/functional/rnn.py @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define function of recurrent neural network +from paddle.fluid.layers.rnn import rnn, birnn -__all__ = [ - # 'gru_unit', - # 'lstm', - # 'lstm_unit' -] +__all__ = ['rnn', 'birnn'] diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index e93a34f59a09c..d2bdd4f80c42a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -21,16 +21,14 @@ import warnings from functools import partial, reduce -from ... 
import fluid -from ...fluid import layers -from ...fluid import initializer as I -from ...fluid.data_feeder import convert_dtype -from ...fluid.dygraph import Layer, LayerList -from ...fluid.param_attr import ParamAttr -from ...fluid.layers import utils, BeamSearchDecoder -from ...fluid.layers.utils import map_structure, flatten, pack_sequence_as - -# TODO: define classes of recurrent neural network +import paddle +from paddle import framework +from paddle.nn import functional as F +from paddle.nn import initializer as I +from paddle.fluid.dygraph import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.data_feeder import convert_dtype __all__ = [ 'RNNCellBase', @@ -42,21 +40,55 @@ 'SimpleRNN', 'LSTM', 'GRU', - # 'StackedRNNCell', - # 'StackedLSTMCell', - # 'stackedGRUCell', ] def split_states(states, bidirectional=False, state_components=1): + """ + Split states of RNN network into possibly nested list or tuple of + states of each RNN cells of the RNN network. + + Arguments: + states (Tensor|tuple|list): the concatenated states for RNN network. + When ``state_components`` is 1, states in a Tensor with shape + ``(L\*D, N, C)`` where ``L`` is the number of layers of the RNN + network, ``D`` is the number of directions of the RNN network(1 + for unidirectional RNNs and 2 for bidirectional RNNs), ``N`` is + the batch size of the input to the RNN network, ``C`` is the + hidden size of the RNN network. + + When `state_components` is larger than 1, ``states`` is a tuple of + ``state_components`` Tensors that meet the requirements described + above. + + For SimpleRNNs and GRUs, ``state_components`` is 1, and for LSTMs, + ``state_components`` is 2. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + ``states`` above. Defaults to 1. + + Returns: + A nested list or tuple of RNN cell states. + If ``bidirectional`` is True, it can be indexed twice to get an RNN + cell state. The first index indicates the layer, the second index + indicates the direction. + If ``bidirectional`` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if ``state_components`` is larger than 1, an RNN cell state + can be indexed one more time to get a tensor of shape(N, C), where + ``N`` is the batch size of the input to the RNN cell, and ``C`` is the + hidden size of the RNN cell. + """ if state_components == 1: - states = layers.unstack(states) + states = paddle.unstack(states) if not bidirectional: return states else: return list(zip(states[::2], states[1::2])) else: - states = tuple([layers.unstack(item) for item in states]) + assert len(states) == state_components + states = tuple([paddle.unstack(item) for item in states]) if not bidirectional: return list(zip(*states)) else: @@ -65,36 +97,45 @@ def split_states(states, bidirectional=False, state_components=1): def concat_states(states, bidirectional=False, state_components=1): + """ + Concatenate a possibly nested list or tuple of RNN cell states into a + compact form. + + Arguments: + states (list|tuple): a possibly nested list or tuple of RNN cell + states. + If ``bidirectional`` is True, it can be indexed twice to get an + RNN cell state. The first index indicates the layer, the second + index indicates the direction. 
+ If ``bidirectional`` is False, it can be indexed once to get an RNN + cell state. The index indicates the layer. + Note that if ``state_components`` is larger than 1, an RNN cell + state can be indexed one more time to get a tensor of shape(N, C), + where ``N`` is the batch size of the input to the RNN cell, and + ``C`` is the hidden size of the RNN cell. + bidirectional (bool): whether the state is of a bidirectional RNN + network. Defaults to False. + state_components (int): the number of the components of the states. see + ``states`` above. Defaults to 1. + + Returns: + Concatenated states for RNN network. + When ``state_components`` is 1, states in a Tensor with shape + ``(L\*D, N, C)`` where ``L`` is the number of layers of the RNN + network, ``D`` is the number of directions of the RNN network(1 for + unidirectional RNNs and 2 for bidirectional RNNs), ``N`` is the batch + size of the input to the RNN network, ``C`` is the hidden size of the + RNN network. + + """ if state_components == 1: - return layers.stack(flatten(states)) + return paddle.stack(flatten(states)) else: states = flatten(states) componnets = [] for i in range(state_components): componnets.append(states[i::state_components]) - return [layers.stack(item) for item in componnets] - - -def birnn(cell_fw, cell_bw, inputs, states_fw, states_bw, sequence_length, - time_major): - outputs_fw, states_fw = layers.rnn(cell_fw, - inputs, - states_fw, - sequence_length, - time_major=time_major) - - outputs_bw, states_bw = layers.rnn(cell_bw, - inputs, - states_bw, - sequence_length, - time_major=time_major, - is_reverse=True) - - outputs = map_structure(lambda x, y: layers.concat([x, y], -1), outputs_fw, - outputs_bw) - - final_states = (states_fw, states_bw) - return outputs, final_states + return [paddle.stack(item) for item in componnets] class RNNCellBase(Layer): @@ -113,7 +154,7 @@ def get_initial_states(self, """ Generate initialized states according to provided shape, data type and value. - Parameters: + Arguments: batch_ref: A (possibly nested structure of) tensor variable[s]. The first dimension of the tensor will be used as batch size to initialize states. @@ -169,13 +210,13 @@ def __init__(self, shape): try: states_dtypes = self.state_dtype if dtype is None else dtype except NotImplementedError: # use fp32 as default - states_dtypes = "float32" + states_dtypes = framework.get_default_dtype() if len(flatten(states_dtypes)) == 1: dtype = flatten(states_dtypes)[0] states_dtypes = map_structure(lambda shape: dtype, states_shapes) init_states = map_structure( - lambda shape, dtype: layers.fill_constant_batch_size_like( + lambda shape, dtype: paddle.fluid.layers.fill_constant_batch_size_like( input=batch_ref, shape=shape.shape, dtype=dtype, @@ -215,20 +256,77 @@ def state_dtype(self): class SimpleRNNCell(RNNCellBase): - def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): + r""" + Elman RNN (SimpleRNN) cell. + + The formula used is as follows: + + .. math:: + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Please refer to `Finding Structure in Time + `_ for more details. + + Arguments: + input_size (int): The input size. + hidden_size (int): The hidden size. + nonlinearity (str): The activation in the SimpleRNN cell. It can be + ``tanh`` or ``relu``. Defaults to ``tanh``. + weight_ih_attr(ParamAttr, optional): The parameter attribute for + ``weight_ih``. 
Default: None. + weight_hh_attr(ParamAttr, optional): The parameter attribute for + ``weight_hh``. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih``. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh``. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + import paddle + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + y, h = cell(x, prev_h) + + """ + + def __init__(self, + input_size, + hidden_size, + nonlinearity="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): super(SimpleRNNCell, self).__init__() std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( - (hidden_size, input_size), default_initializer=I.Uniform(-std, std)) + (hidden_size, input_size), + weight_ih_attr, + default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( (hidden_size, hidden_size), + weight_hh_attr, default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( (hidden_size, ), + bias_ih_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( (hidden_size, ), + bias_hh_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) @@ -239,18 +337,36 @@ def __init__(self, input_size, hidden_size, nonlinearity="tanh", name=None): "nonlinearity for SimpleRNNCell should be tanh or relu, " "but get {}".format(nonlinearity)) self.nonlinearity = nonlinearity - self._nonlinear_fn = layers.tanh \ + self._nonlinear_fn = paddle.tanh \ if nonlinearity == "tanh" \ - else layers.relu + else F.relu def forward(self, inputs, states=None): + """ + Given the input and previous atate, compute the output and update state. + + Arguments: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (Tensor, optional): shape `[batch_size, hidden_size]`, the + previous hidden state, corresponding to :math:`h_{t-1}` in the + formula. When states is None, zero state is used. Defaults to + None. + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_h = states - i2h = layers.matmul(inputs, self.weight_ih, transpose_y=True) + i2h = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: i2h += self.bias_ih - h2h = layers.matmul(pre_h, self.weight_hh, transpose_y=True) + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h2h += self.bias_hh h = self._nonlinear_fn(i2h + h2h) @@ -262,91 +378,119 @@ def state_shape(self): class LSTMCell(RNNCellBase): - """ + r""" Long-Short Term Memory(LSTM) RNN cell. The formula used is as follows: .. 
math::
-        i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}})
-        f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}})
-        o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}})
-        c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}})
-        h_{t} & = o_{t} \\tanh (c_{t})
+        i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi})
+        f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf})
+        o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho})
+        \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg})
+        c_{t} & = f_{t} \* c_{t-1} + i_{t} \* \\widetilde{c}_{t}
+        h_{t} & = o_{t} \* \\tanh(c_{t})
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    `_ for more details.

-    Parameters:
-        input_size (int): The input size in the LSTM cell.
-        hidden_size (int): The hidden size in the LSTM cell.
-        param_attr(ParamAttr, optional): The parameter attribute for the learnable
-            weight matrix. Default: None.
-        bias_attr (ParamAttr, optional): The parameter attribute for the bias
-            of LSTM. Default: None.
-        dtype(string, optional): The data type used in this cell. Default float32.
+    Arguments:
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_ih``. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_hh``. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_ih``. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_hh``. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.

    Examples:
        .. code-block:: python
+
            import paddle
-            inputs = paddle.rand((2, 4, 32))
-            cell = paddle.LSTMCell(input_size=32, hidden_size=64)
-            rnn = paddle.RNN(cell=cell)
-            outputs, _ = rnn(inputs)  # [2, 4, 64]
+            paddle.disable_static()
+
+            x = paddle.randn((4, 16))
+            prev_h = paddle.randn((4, 32))
+
+            cell = paddle.nn.LSTMCell(16, 32)
+            y, h = cell(x, prev_h)
+
    """

-    def __init__(self, input_size, hidden_size, name=None):
+    def __init__(self,
+                 input_size,
+                 hidden_size,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
        super(LSTMCell, self).__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_ih = self.create_parameter(
            (4 * hidden_size, input_size),
+            weight_ih_attr,
            default_initializer=I.Uniform(-std, std))
        self.weight_hh = self.create_parameter(
            (4 * hidden_size, hidden_size),
+            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = self.create_parameter(
            (4 * hidden_size, ),
+            bias_ih_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))
        self.bias_hh = self.create_parameter(
            (4 * hidden_size, ),
+            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
-        self._gate_activation = layers.sigmoid
-        self._activation = layers.tanh
+        self._gate_activation = F.sigmoid
+        self._activation = paddle.tanh

    def forward(self, inputs, states=None):
        """
-        Performs single step LSTM calculations.
- Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tuple of two tensors, each shaped - `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` - in the formula. The data type should be float32 or float64. + Given the input and previous atate, compute the output and update state. + + Arguments: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (tuple, optional): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the previous hidden state, + corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. + When states is None, zero state is used. Defaults to None. Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula; `new_states` is a list containing \ - two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}, c_{t}` in the formula. The data type of these \ - tensors all is same as that of `states`. + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (tuple): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the new hidden states, + corresponding to :math:`h_{t}, c{t}` in the formula. + """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states - gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: gates = gates + self.bias_ih - gates += layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + gates += paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) if self.bias_hh is not None: gates = gates + self.bias_hh - chunked_gates = layers.split(gates, num_or_sections=4, dim=-1) + chunked_gates = paddle.split(gates, num_or_sections=4, axis=-1) i = self._gate_activation(chunked_gates[0]) f = self._gate_activation(chunked_gates[1]) @@ -359,74 +503,95 @@ def forward(self, inputs, states=None): @property def state_shape(self): """ - The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` - (-1 for batch size would be automatically inserted into shape). These two - shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + The `state_shape` of LSTMCell is a tuple with two shapes: + `((hidden_size, ), (hidden_size,))`. (-1 for batch size would be + automatically inserted into shape). These two shapes correspond + to :math:`h_{t-1}` and :math:`c_{t-1}` separately. """ return ((self.hidden_size, ), (self.hidden_size, )) class GRUCell(RNNCellBase): - """ + r""" Gated Recurrent Unit (GRU) RNN cell. The formula for GRU used is as follows: .. 
math::
-        u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}})
-
-        r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}})
-
-        \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}})
-
-        h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t}
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
+        \\widetilde{h}_{t} & = \\tanh(W_{ic}x_{t} + b_{ic} + r_{t} \* (W_{hc}h_{t-1} + b_{hc}))
+        h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t}
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and \* is the elementwise
+    multiplication operator.

    Please refer to `An Empirical Exploration of Recurrent Network Architectures
    `_ for more details.

    Parameters:
-        input_size (int): The input size for the first GRU cell.
-        hidden_size (int): The hidden size for every GRU cell.
-        param_attr(ParamAttr, optional): The parameter attribute for the learnable
-            weight matrix. Default: None.
-        bias_attr (ParamAttr, optional): The parameter attribute for the bias
-            of LSTM. Default: None.
-        dtype(string, optional): The data type used in this cell. Default float32.
+        input_size (int): The input size.
+        hidden_size (int): The hidden size.
+        weight_ih_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_ih``. Default: None.
+        weight_hh_attr(ParamAttr, optional): The parameter attribute for
+            ``weight_hh``. Default: None.
+        bias_ih_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_ih``. Default: None.
+        bias_hh_attr (ParamAttr, optional): The parameter attribute for the
+            ``bias_hh``. Default: None.
+        name (str, optional): Name for the operation (optional, default is
+            None). For more information, please refer to :ref:`api_guide_Name`.

    Examples:
-
        .. 
code-block:: python import paddle - inputs = paddle.rand((2, 4, 32)) - cell = BasicGRUCell(input_size=32, hidden_size=64) - rnn = RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + paddle.disable_static() + + x = paddle.randn((4, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.GRUCell(16, 32) + y, h = cell(x, prev_h) + """ - def __init__(self, input_size, hidden_size, name=None): + def __init__(self, + input_size, + hidden_size, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): super(GRUCell, self).__init__() std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (3 * hidden_size, input_size), + weight_ih_attr, default_initializer=I.Uniform(-std, std)) self.weight_hh = self.create_parameter( (3 * hidden_size, hidden_size), + weight_hh_attr, default_initializer=I.Uniform(-std, std)) self.bias_ih = self.create_parameter( (3 * hidden_size, ), + bias_ih_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.bias_hh = self.create_parameter( (3 * hidden_size, ), + bias_hh_attr, is_bias=True, default_initializer=I.Uniform(-std, std)) self.hidden_size = hidden_size self.input_size = input_size - self._gate_activation = layers.sigmoid - self._activation = layers.tanh + self._gate_activation = F.sigmoid + self._activation = paddle.tanh def forward(self, inputs, states=None): """ @@ -450,15 +615,15 @@ def forward(self, inputs, states=None): states = self.get_initial_states(inputs, self.state_shape) pre_hidden = states - x_gates = layers.matmul(inputs, self.weight_ih, transpose_y=True) + x_gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True) if self.bias_ih is not None: x_gates = x_gates + self.bias_ih - h_gates = layers.matmul(pre_hidden, self.weight_hh, transpose_y=True) + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h_gates = h_gates + self.bias_hh - x_r, x_z, x_c = layers.split(x_gates, num_or_sections=3, dim=1) - h_r, h_z, h_c = layers.split(h_gates, num_or_sections=3, dim=1) + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) r = self._gate_activation(x_r + h_r) z = self._gate_activation(x_z + h_z) @@ -470,7 +635,7 @@ def forward(self, inputs, states=None): @property def state_shape(self): """ - The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch size would be automatically inserted into shape). The shape corresponds to :math:`h_{t-1}`. """ @@ -479,27 +644,62 @@ def state_shape(self): class RNN(Layer): """ - RNN creates a recurrent neural network specified by RNNCell `cell`, which - performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. + Wrapper for RNN, which creates a recurrent neural network specified with a + RNN cell. It performs :code:`cell.forward()` repeatedly until reaches to + the maximum length of `inputs`. - Parameters: - cell(RNNCell): An instance of `RNNCell`. + Arguments: + cell(RNNCellBase): An instance of `RNNCell`. is_reverse (bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. + order of input sequences. Defaults to False. time_major (bool, optional): Indicate the data layout of Tensor included in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. 
If + be batch major with shape `[batch_size, time_steps, ...]`. If `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. + `[time_steps, batch_size, ...]`. Defaults to False. + + Inputs: + inputs (Tensor): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Tensor|list|tuple, optional): A (possibly nested structure of) + tensor[s], representing the initial state for the rnn cell. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Outputs: + (outputs, final_states) + outputs (Tensor|list|tuple): the output sequence. Tensor or nested + structure of Tensor. + If `time_major` is True, the shape of each tensor in outpus is + `[time_steps, batch_size, hidden_size]`, else + `[batch_size, time_steps, hidden_size]`. + final_states (Tensor|list|tuple): final states. A (possibly nested structure of) + tensor[s], representing the final state for RNN. It has the same + structure of intial state. Each tensor in final states has the same + shape and dtype as the corresponding tensor in initial states. Examples: .. code-block:: python + import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] + paddle.disable_static() + + inputs = paddle.rand((4, 23, 16)) + prev_h = paddle.randn((4, 32)) + + cell = paddle.nn.SimpleRNNCell(16, 32) + rnn = paddle.RNN(cell) + outputs, final_states = rnn(inputs, prev_h) + """ def __init__(self, cell, is_reverse=False, time_major=False): @@ -510,81 +710,85 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.cell.call = self.cell.forward self.is_reverse = is_reverse self.time_major = time_major - # self.batch_index, self.time_step_index = (1, 0) \ - # if time_major else (0, 1) def forward(self, inputs, initial_states=None, sequence_length=None): - """ - Performs :code:`cell.forward()` repeatedly until reaches to the maximum - length of `inputs`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Variable, optional): A (possibly nested structure of) - tensor variable[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. 
- **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ - outputs and states, both are Tensor or nested structure of Tensor. \ - `final_outputs` has the same structure and data types as \ - the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ - stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ - for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ - `final_states` is the counterpart at last time step of initial states, \ - thus has the same structure with it and has tensors with same shapes \ - and data types. - """ - if initial_states is None: initial_states = self.cell.get_initial_states( batch_ref=inputs, dtype=inputs.dtype, batch_dim_idx=self.batch_index) - final_outputs, final_states = layers.rnn( - self.cell, - inputs, - initial_states=initial_states, - sequence_length=sequence_length, - time_major=self.time_major, - is_reverse=self.is_reverse) + final_outputs, final_states = F.rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) return final_outputs, final_states class BiRNN(Layer): """ - Wrapper for bidirectional RNN. It assembles two RNNCell instances to perform - forward and backward RNN separately, and merge outputs of these two RNN - according to `merge_mode`. + Wrapper for bidirectional RNN. It assembles two RNN cells by performing + forward and backward RNN separately, and concat outputs. + Parameters: - cell_fw (RNNCell): A RNNCell instance used for forward RNN. - cell_bw (RNNCell): A RNNCell instance used for backward RNN. + cell_fw (RNNCellBase): A RNNCell instance used for forward RNN. + cell_bw (RNNCellBase): A RNNCell instance used for backward RNN. + time_major (bool): Whether the first dimension of the input means the + time steps. + + Inputs: + inputs (Tensor): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in both forward and backward RNN. + initial_states (list|tuple, optional): A tuple of the initial states of + the forward cell and backward cell. + If not provided, `cell.get_initial_states` would be used to produce + the initial states. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + + Outputs: + outputs (Tensor): A (possibly nested structure of) tensor variable[s], + the outputs of the bidirectional RNN. It is the concatenation + of the outputs for both the forward RNN and backward RNN along + the last axis. + The shape of tensor should be `[batch_size, time_steps, ...]` + for `time_major == False` or `[time_steps, batch_size, ...]` + for `time_major == True`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. Examples: .. 
code-block:: python + import paddle - from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN - inputs = paddle.rand((2, 4, 32)) - cell_fw = StackedLSTMCell(32, 64) - cell_bw = StackedLSTMCell(32, 64) - bi_rnn = BidirectionalRNN(cell_fw, cell_bw) - outputs, _ = bi_rnn(inputs) # [2, 4, 128] + paddle.disable_static() + + cell_fw = LSTMCell(16, 32) + cell_bw = LSTMCell(16, 32) + rnn = BidirectionalRNN(cell_fw, cell_bw) + + inputs = paddle.rand((2, 23, 16)) + outputs, final_states = rnn(inputs) + """ def __init__(self, cell_fw, cell_bw, time_major=False): super(BiRNN, self).__init__() self.cell_fw = cell_fw self.cell_bw = cell_bw + for cell in [self.cell_fw, self.cell_bw]: + if not hasattr(cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + cell.call = cell.forward self.time_major = time_major def forward(self, @@ -592,131 +796,47 @@ def forward(self, initial_states=None, sequence_length=None, **kwargs): - """ - Performs forward and backward RNN separately, and merge outputs of these - two RNN according to `merge_mode`. - Parameters: - inputs (Variable): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (Variable|list|tuple): If it is a list or tuple, its - length should be 2 to include initial states of forward and backward - RNN separately. Otherwise it would be used twice for the two RNN. - If None, `cell.get_initial_states` would be used to produce the initial - states. Default None. - sequence_length (Variable, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. - Returns: - tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ - is produced by merge outputs of forward and backward RNN according \ - to `merge_mode`; similarly, `final_states` is produced by merge \ - `final_states` of forward and backward RNN. - """ if isinstance(initial_states, (list, tuple)): assert len(initial_states) == 2, \ "length of initial_states should be 2 when it is a list/tuple" else: initial_states = [initial_states, initial_states] - states_fw, states_bw = initial_states - outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, - states_fw, states_bw, sequence_length, - self.time_major) + outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) return outputs, final_states - @staticmethod - def bidirect_param_attr(param_attr): - """ - Converts `param_attr` to a pair of `param_attr` when it is not a list - or tuple with length 2, also rename every one by appending a suffix to - avoid having same names when `param_attr` contains a name. - - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. When - it is a list or tuple, its length must be 2. - - Returns: - list: A pair composed of forward and backward RNN cell's `param_attr`. 
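
As an aside for readers of the BiRNN wrapper above: the sketch below is a plain NumPy illustration (the `step` helper and all sizes are made up for the example; it is not how `F.birnn` is implemented) of the merge behaviour described in the docstring, i.e. one cell is run left-to-right, a second cell right-to-left over the same batch, and the two output sequences are concatenated along the last axis, giving outputs of width 2 * hidden_size.

    import numpy as np

    batch, time_steps, in_size, hidden = 4, 5, 16, 32
    rng = np.random.RandomState(0)

    def step(x, h, w_ih, w_hh):
        # a single tanh recurrence, standing in for any RNN cell's forward
        return np.tanh(x @ w_ih.T + h @ w_hh.T)

    def run(xs, w_ih, w_hh, reverse=False):
        h = np.zeros((batch, hidden))
        outs = [None] * time_steps
        order = reversed(range(time_steps)) if reverse else range(time_steps)
        for t in order:
            h = step(xs[:, t], h, w_ih, w_hh)
            outs[t] = h
        return np.stack(outs, axis=1)               # [batch, time_steps, hidden]

    xs = rng.randn(batch, time_steps, in_size)
    out_fw = run(xs, rng.randn(hidden, in_size), rng.randn(hidden, hidden))
    out_bw = run(xs, rng.randn(hidden, in_size), rng.randn(hidden, hidden),
                 reverse=True)                      # backward cell reads time reversed
    merged = np.concatenate([out_fw, out_bw], axis=-1)
    print(merged.shape)                             # (4, 5, 64): last axis is 2 * hidden
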
- """ - if isinstance(param_attr, (list, tuple)): - assert len( - param_attr - ) == 2, "length of param_attr should be 2 when it is a list/tuple" - param_attrs = param_attr - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - attr_fw = copy.deepcopy(attr) - if attr.name: - attr_fw.name = attr_fw.name + "_fw" - param_attrs.append(attr_fw) - attr_bw = copy.deepcopy(attr) - if attr.name: - attr_bw.name = attr_bw.name + "_bw" - param_attrs.append(attr_bw) - return param_attrs - - -class SimpleRNN(LayerList): - def __init__(self, - input_size, - hidden_size, - num_layers=1, - nonlinearity="tanh", - direction="forward", - dropout=0., - time_major=False, - name=None): - super(SimpleRNN, self).__init__() - - if direction in ["forward", "backward"]: - is_reverse = direction == "backward" - cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) - self.append(RNN(cell, is_reverse, time_major)) - for i in range(1, num_layers): - cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) - self.append(RNN(cell, is_reverse, time_major)) - elif direction == "bidirectional": - cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) - cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - for i in range(1, num_layers): - cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, - nonlinearity) - cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, - nonlinearity) - self.append(BiRNN(cell_fw, cell_bw, time_major)) - else: - raise ValueError( - "direction should be forward, backward or bidirectional, " - "received direction = {}".format(direction)) - self.dropout = dropout - self.num_directions = 2 if direction == "bidirectional" else 1 - self.time_major = time_major - self.num_layers = num_layers +class RNNMixin(LayerList): + r""" + A Mixin class for RNN networks. It provides forward method for SimpleRNN, + LSTM and GRU. + """ def forward(self, inputs, initial_states=None, sequence_length=None): batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] + dtype = inputs.dtype if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, + state_shape = (self.num_layers * self.num_directions, -1, self.hidden_size) - initial_states = layers.zeros(state_shape, dtype=inputs.dtype) - - states = split_states(initial_states, self.num_directions == 2) + if self.state_components == 1: + initial_states = paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + else: + initial_states = tuple([ + paddle.fluid.layers.fill_constant_batch_size_like( + inputs, state_shape, dtype, 0, batch_index, 1) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) final_states = [] + for i, rnn_layer in enumerate(self): if i > 0: - inputs = layers.dropout( + inputs = F.dropout( inputs, self.dropout, dropout_implementation="upscale_in_train") @@ -724,74 +844,225 @@ def forward(self, inputs, initial_states=None, sequence_length=None): final_states.append(final_state) inputs = outputs - final_states = concat_states(final_states, self.num_directions == 2) + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) return outputs, final_states -class LSTM(LayerList): +class SimpleRNN(RNNMixin): + r""" + Multilayer Elman network(SimpleRNN). 
It takes a sequence and an initial + state as inputs, and returns the output sequence and the final state. + + Each layer inside the SimpleRNN maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) + and new state(:math:`h_{t}`). + + .. math:: + + h_{t} & = \mathrm{tanh}(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + nonlinearity (str): The activation in each SimpleRNN cell. It can be + ``tanh`` or ``relu``. Defaults to ``tanh``. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + + Examples: + .. 
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.SimpleRNN(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) + + """ + def __init__(self, input_size, hidden_size, num_layers=1, + nonlinearity="tanh", direction="forward", dropout=0., time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, name=None): - super(LSTM, self).__init__() + super(SimpleRNN, self).__init__() if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = LSTMCell(input_size, hidden_size) + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, bias_ih_attr, + bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = LSTMCell(hidden_size, hidden_size) + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = LSTMCell(input_size, hidden_size) - cell_bw = LSTMCell(input_size, hidden_size) + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + weight_ih_attr, weight_hh_attr, + bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): - cell_fw = LSTMCell(2 * hidden_size, hidden_size) - cell_bw = LSTMCell(2 * hidden_size, hidden_size) + cell_fw = SimpleRNNCell( + 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = SimpleRNNCell( + 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: raise ValueError( "direction should be forward, backward or bidirectional, " "received direction = {}".format(direction)) + self.input_size = input_size + self.hidden_size = hidden_size self.dropout = dropout self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major self.num_layers = num_layers + self.state_components = 1 - def forward(self, inputs, initial_states=None, sequence_length=None): - batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] - if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, - self.hidden_size) - init_h = layers.zeros(state_shape, dtype=inputs.dtype) - init_c = layers.zeros(state_shape, dtype=inputs.dtype) - initial_states = (init_h, init_c) - states = split_states(initial_states, self.num_directions == 2, 2) - final_states = [] - for i, rnn_layer in enumerate(self): - if i > 0: - inputs = layers.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") - outputs, final_state = rnn_layer(inputs, states[i], sequence_length) - final_states.append(final_state) - inputs = outputs +class LSTM(RNNMixin): + r""" + Multilayer LSTM. It takes a sequence and an initial state as inputs, and + returns the output sequence and the final state. 
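
Recapping the SimpleRNN layer defined above before the LSTM details: the following NumPy sketch (illustrative only; weight names follow the weight_ih/weight_hh convention used in this patch) unrolls the Elman recurrence h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh) over a batch-major input and confirms the documented output and state shapes for a single forward-direction layer.

    import numpy as np

    batch, time_steps, input_size, hidden_size = 4, 23, 16, 32
    rng = np.random.RandomState(0)

    weight_ih = rng.uniform(-0.1, 0.1, (hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (hidden_size, hidden_size))
    bias_ih = np.zeros(hidden_size)
    bias_hh = np.zeros(hidden_size)

    x = rng.randn(batch, time_steps, input_size)   # batch-major, time_major=False
    h = np.zeros((batch, hidden_size))             # zero initial state

    outputs = []
    for t in range(time_steps):
        # h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh)
        h = np.tanh(x[:, t] @ weight_ih.T + bias_ih + h @ weight_hh.T + bias_hh)
        outputs.append(h)

    outputs = np.stack(outputs, axis=1)
    print(outputs.shape)   # (4, 23, 32): [batch_size, time_steps, hidden_size]
    print(h.shape)         # (4, 32): final state of this single layer/direction
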
- final_states = concat_states(final_states, self.num_directions == 2, 2) - return outputs, final_states + Each layer inside the LSTM maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + output(:math:`y_{t}`) and new state(:math:`h_{t}, c_{t}`). + .. math:: + + i_{t} & = \sigma(W_{ii}x_{t} + b_{ii} + W_{hi}h_{t-1} + b_{hi}) + f_{t} & = \sigma(W_{if}x_{t} + b_{if} + W_{hf}h_{t-1} + b_{hf}) + o_{t} & = \sigma(W_{io}x_{t} + b_{io} + W_{ho}h_{t-1} + b_{ho}) + \\widetilde{c}_{t} & = \\tanh (W_{ig}x_{t} + b_{ig} + W_{hg}h_{t-1} + b_{hg}) + c_{t} & = f_{t} \* c{t-1} + i{t} \* \\widetile{c}_{t} + h_{t} & = o_{t} \* \\tanh(c_{t}) + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (tuple, optional): the initial state, a tuple of (h, c), + the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): the final state, a tuple of (h, c), + the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + + Examples: + .. 
code-block:: python + + import paddle + paddle.disable_static() + + rnn = paddle.nn.LSTM(16, 32, 2) + + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + prev_c = paddle.randn((2, 4, 32)) + y, (h, c) = rnn(x, (prev_h, prev_c)) + + """ -class GRU(LayerList): def __init__(self, input_size, hidden_size, @@ -799,522 +1070,170 @@ def __init__(self, direction="forward", dropout=0., time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, name=None): - super(GRU, self).__init__() + super(LSTM, self).__init__() if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = GRUCell(input_size, hidden_size) + cell = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = GRUCell(hidden_size, hidden_size) + cell = LSTMCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = GRUCell(input_size, hidden_size) - cell_bw = GRUCell(input_size, hidden_size) + cell_fw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): - cell_fw = GRUCell(2 * hidden_size, hidden_size) - cell_bw = GRUCell(2 * hidden_size, hidden_size) + cell_fw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = LSTMCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: raise ValueError( "direction should be forward, backward or bidirectional, " "received direction = {}".format(direction)) + self.input_size = input_size + self.hidden_size = hidden_size self.dropout = dropout self.num_directions = 2 if direction == "bidirectional" else 1 self.time_major = time_major self.num_layers = num_layers + self.state_components = 2 - def forward(self, inputs, initial_states=None, sequence_length=None): - batch_index = 1 if self.time_major else 0 - batch_size = inputs.shape[batch_index] if fluid.in_dygraph_mode() \ - else layers.shape(inputs)[batch_index] - if initial_states is None: - state_shape = (self.num_directions * self.num_layers, batch_size, - self.hidden_size) - initial_states = layers.zeros(state_shape, dtype=inputs.dtype) - states = split_states(initial_states, self.num_directions == 2) - - final_states = [] - for i, rnn_layer in enumerate(self): - if i > 0: - inputs = layers.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") - outputs, final_state = rnn_layer(inputs, states[i], sequence_length) - final_states.append(final_state) - inputs = outputs - final_states = concat_states(final_states, self.num_directions == 2) - return outputs, final_states +class GRU(RNNMixin): + r""" + Multilayer GRU. It takes a sequence and an initial state as inputs, and + returns the output sequence and the final state. 
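
Recapping the LSTM layer defined above before the GRU details: this single-step NumPy sketch (an illustration using the packed 4 * hidden_size weight layout also used by the reference LSTMCell in rnn_numpy.py later in this patch; it is not the operator implementation) evaluates the gate equations from the docstring.

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    batch, input_size, hidden_size = 4, 16, 32
    rng = np.random.RandomState(0)

    # gates i, f, g (cell candidate) and o are packed along the first axis
    weight_ih = rng.uniform(-0.1, 0.1, (4 * hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (4 * hidden_size, hidden_size))
    bias = np.zeros(4 * hidden_size)

    x = rng.randn(batch, input_size)
    h_prev = np.zeros((batch, hidden_size))
    c_prev = np.zeros((batch, hidden_size))

    gates = x @ weight_ih.T + h_prev @ weight_hh.T + bias
    i, f, g, o = np.split(gates, 4, axis=-1)
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
    c = f * c_prev + i * np.tanh(g)     # new cell state
    h = o * np.tanh(c)                  # new hidden state, also the step output
    print(h.shape, c.shape)             # (4, 32) (4, 32)
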
+ Each layer inside the GRU maps the input sequence and initial state + to the output sequence and final state in the following manner: at each + step, it takes step input(:math:`x_{t}`) and previous + state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) + and new state(:math:`h_{t}`). -# class LSTM(Layer): -# """ -# Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input -# sequence. - -# The formula for LSTM used here is as follows: - -# .. math:: -# i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) -# f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) -# o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) -# c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) -# h_{t} & = o_{t} \\tanh (c_{t}) - -# Parameters: -# input_size (int): The input feature size for the first LSTM. -# hidden_size (int): The hidden size for every LSTM. -# num_layers(int, optional): The number of LSTM to be stacked. Default 1. -# dropout(float, optional): The dropout probability applied on the outputs -# of each LSTM except the last one. 0 for not dropout. Default 0.0 -# direction (str, optional): Indicate the direction for LSTM calculation -# applying on the input sequences. It can be `forward`, `backward` or -# `bidirect`. If it is `backward`, calculate in the reverse order of -# input sequences. If it is `bidirect`, each layer would be a -# bidirectional LSTM composed of a `forward` LSTM and `backward` LSTM, -# and it concatenates their outputs as outputs. Default: `forward`. -# time_major (bool, optional): Indicate the data layout of Tensor included -# in `input` and `output` tensors. If `False`, the data layout would -# be batch major with shape `[batch_size, sequence_length, ...]`. If -# `True`, the data layout would be time major with shape -# `[sequence_length, batch_size, ...]`. Default: `False`. -# param_attr (list|tuple|ParamAttr): A list, tuple or something can be -# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is -# a list or tuple, it's length must equal to `num_layers`. Otherwise, -# construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. -# Default None. -# bias_attr (list|tuple|ParamAttr): A list, tuple or something can be -# converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is -# a list or tuple, it's length must equal to `num_layers`. Otherwise, -# construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. -# Default None. -# dtype(string, optional): The data type used in this cell. It can be -# float32 or float64. Default float32. -# Examples: -# .. 
code-block:: python -# import paddle -# import paddle.fluid as fluid -# from paddle.incubate.hapi.text import LSTM -# inputs = paddle.rand((2, 4, 32)) -# lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) -# outputs, _ = lstm(inputs) # [2, 4, 64] -# """ - -# def __init__(self, -# input_size, -# hidden_size, -# num_layers=1, -# direction="forward", -# dropout=0.0, -# time_major=False, -# name=None): -# super(LSTM, self).__init__() -# self.input_size = input_size -# self.hidden_size = hidden_size -# self.num_layers = num_layers -# self.dropout = dropout -# self.direction = direction -# self.num_directions = 2 if direction == 'bidirect' else 1 -# self.time_major = time_major - -# if direction == 'bidirect': -# param_attrs = BidirectionalRNN.bidirect_param_attr(param_attr) -# bias_attrs = BidirectionalRNN.bidirect_param_attr(bias_attr) -# fw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[0], -# num_layers) -# bw_param_attrs = StackedRNNCell.stack_param_attr(param_attrs[1], -# num_layers) -# fw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[0], -# num_layers) -# bw_bias_attrs = StackedRNNCell.stack_param_attr(bias_attrs[1], -# num_layers) - -# # maybe design cell including both forward and backward later -# merge_mode = 'concat' -# rnns = [] -# for i in range(num_layers): -# cell_fw = StackedLSTMCell(input_size if i == 0 else ( -# hidden_size * 2 if merge_mode == 'concat' else -# hidden_size), hidden_size, 1, dropout, fw_param_attrs[i], -# fw_bias_attrs[i], dtype) -# cell_bw = StackedLSTMCell(input_size if i == 0 else ( -# hidden_size * 2 if merge_mode == 'concat' else -# hidden_size), hidden_size, 1, dropout, bw_param_attrs[i], -# bw_bias_attrs[i], dtype) -# rnns.append( -# BidirectionalRNN( -# cell_fw, -# cell_bw, -# merge_mode=merge_mode, -# time_major=time_major)) -# self.lstm = LayerList(rnns) -# else: -# lstm_cell = StackedLSTMCell(input_size, hidden_size, num_layers, -# dropout, param_attr, bias_attr, dtype) -# self.lstm = RNN(lstm_cell, -# is_reverse=(direction == "backward"), -# time_major=time_major) - -# def forward(self, input, initial_states=None, sequence_length=None): -# """ -# Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` -# is the `inputs` of the subsequent one. -# Parameters: -# inputs (Variable): The inputs for the first LSTM. It is a float32 -# or float64 tensor shaped `[batch_size, sequence_length, input_size]`. -# initial_states (list|None, optional): A list containing initial states -# of all stacked LSTM, and the initial states of each LSTM is a pair -# of tensors shaped `[batch_size, hidden_size]`. If not provided, -# use 0 as initial states. Default None. -# sequence_length (Variable, optional): A tensor with shape `[batch_size]`. -# It stores real length of each instance, thus enables users to extract -# the last valid state when past a batch element's sequence length for -# correctness. If not provided, the paddings would be treated same as -# non-padding inputs. Default None. -# Returns: -# tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ -# is the output of last LSTM and it is a tensor with shape \ -# `[batch_size, sequence_length, hidden_size]` and has the same \ -# data type as `inputs`, `final_states` is the counterpart of \ -# `initial_states` at last time step, thus has the same structure \ -# with it and has tensors with same shapes data types. 
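
To tie the GRU update equations in the docstring above to code, here is a one-step NumPy sketch (illustrative only; it follows the packed 3 * hidden_size weight layout and the reset-after-matmul convention of the reference GRUCell in rnn_numpy.py later in this patch).

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    batch, input_size, hidden_size = 4, 16, 32
    rng = np.random.RandomState(0)

    # r (reset), z (update) and c (candidate) blocks packed along the first axis
    weight_ih = rng.uniform(-0.1, 0.1, (3 * hidden_size, input_size))
    weight_hh = rng.uniform(-0.1, 0.1, (3 * hidden_size, hidden_size))

    x = rng.randn(batch, input_size)
    h_prev = np.zeros((batch, hidden_size))

    x_r, x_z, x_c = np.split(x @ weight_ih.T, 3, axis=-1)
    h_r, h_z, h_c = np.split(h_prev @ weight_hh.T, 3, axis=-1)

    r = sigmoid(x_r + h_r)              # reset gate
    z = sigmoid(x_z + h_z)              # update gate
    c = np.tanh(x_c + r * h_c)          # candidate, reset applied after the matmul
    h = z * h_prev + (1.0 - z) * c      # new state, also the step output
    print(h.shape)                      # (4, 32)
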
-# """ -# if not isinstance(self.lstm, LayerList): -# return self.lstm(input, initial_states, sequence_length) -# else: -# if isinstance(initial_states, (list, tuple)): -# assert len(initial_states) == self.num_layers, ( -# "length of initial_states should be %d when it is a list|tuple" -# % self.num_layers) -# else: -# initial_states = [initial_states] * self.num_layers -# stacked_states = [] -# for i in range(self.num_layers): -# output, states = self.lstm[i](input, initial_states[i], -# sequence_length) -# input = output -# stacked_states.append(states) -# return output, stacked_states - - -# TODO: restucture RNN layers -class StackedRNNCell(RNNCellBase): - """ - Wrapper allowing a stack of RNN cells to behave as a single cell. It is used - to implement stacked RNNs. + .. math:: - Parameters: - cells (list|tuple): List of RNN cell instances. + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr}) + z_{t} & = \sigma(W_{iz)x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz}) + \\widetilde{h}_{t} & = \\tanh(W_{ic)x_{t} + b_{ic} + r_{t} \* (W_{hc}x_{t} + b{hc})) + h_{t} & = z_{t} \* h_{t-1} + (1 - z_{t}) \* \\widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and \* is the elemetwise + multiplication operator. + + Arguments: + input_size (int): The input size for the first layer's cell. + hidden_size (int): The hidden size for each layer's cell. + num_layers (int): Number of layers. Defaults to 1. + direction (str): The direction of the network. It can be "forward", + "backward" and "bidirectional". Defaults to "forward". + dropout (float): The droput probability. Dropout is applied to the + input of each layer except for the first layer. + time_major (bool): Whether the first dimension of the input means the + time steps. + weight_ih_attr (ParamAttr, optional): The parameter attribute for + ``weight_ih`` of each cell. Default: None. + weight_hh_attr (ParamAttr, optional): The parameter attribute for + ``weight_hh`` of each cell. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_ih`` of each cells. Default: None. + bias_ih_attr (ParamAttr, optional): The parameter attribute for the + ``bias_hh`` of each cells. Default: None. + name (str, optional): Name for the operation (optional, default is + None). For more information, please refer to :ref:`api_guide_Name`. + + Inputs: + inputs (Tensor): the input sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + initial_states (Tensor, optional): the initial state. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. + If initial_state is not given, zero initial states are used. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whos time step + index are not less than the valid length are treated as paddings. + + Outputs: + (outputs, final_states) + outputs (Tensor): the output sequence. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, + else, the shape is `[batch_size, time_steps, hidden_size]`. + final_states (Tensor): final states. The shape is + `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: .. 
code-block:: python - from paddle import LSTMCell, StackedRNNCell - cells = [LSTMCell(32, 32), LSTMCell(32, 32)] - stack_rnn = StackedRNNCell(cells) - """ - - def __init__(self, cells): - super(StackedRNNCell, self).__init__() - self.cells = LayerList(cells) - - def forward(self, inputs, states): - """ - Performs :code:`cell.forward` for all including cells sequentially. - Each cell's `inputs` is the `outputs` of the previous cell. And each - cell's `states` is the corresponding one in `states`. - Parameters: - inputs (Variable): The inputs for the first cell. Mostly it is a - float32 or float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ - `outputs` of the last cell. `new_states` is a list composed \ - of all cells' `new_states`, and its structure and data type is \ - same as that of `states` argument. - """ - new_states = [] - for cell, state in zip(self.cells, states): - outputs, new_state = cell(inputs, state) - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @staticmethod - def stack_param_attr(param_attr, n): - """ - If `param_attr` is a list or tuple, convert every element in it to a - ParamAttr instance. Otherwise, repeat `param_attr` `n` times to - construct a list, and rename every one by appending a increasing index - suffix to avoid having same names when `param_attr` contains a name. - Parameters: - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. - n (int): The times to repeat to construct a list when `param_attr` - is not a list or tuple. - Returns: - list: A list composed of each including cell's `param_attr`. - """ - if isinstance(param_attr, (list, tuple)): - assert len(param_attr) == n, ( - "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] - else: - param_attrs = [] - attr = ParamAttr._to_attr(param_attr) - for i in range(n): - attr_i = copy.deepcopy(attr) - if attr.name: - attr_i.name = attr_i.name + "_" + str(i) - param_attrs.append(attr_i) - return param_attrs - - @property - def state_shape(self): - """ - The `state_shape` of StackedRNNCell is a list composed of each including - cell's `state_shape`. - Returns: - list: A list composed of each including cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedLSTMCell(RNNCellBase): - """ - Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used - to implement stacked LSTM. - - The formula for LSTM used here is as follows: + import paddle + paddle.disable_static() - .. math:: - i_{t} & = \sigma(W_{x_{i}}x_{t} + b_{x_{i}} + W_{h_{i}}h_{t-1} + b_{h_{i}}) - f_{t} & = \sigma(W_{x_{f}}x_{t} + b_{x_{f}} + W_{h_{f}}h_{t-1} + b_{h_{f}}) - o_{t} & = \sigma(W_{x_{o}}x_{t} + b_{x_{o}} + W_{h_{o}}h_{t-1} + b_{h_{o}}) - c_{t} & = f_{t}c_{t-1} + i_{t} \\tanh (W_{x_{c}}x_{t} + b_{x_{c}} + W_{h_{c}}h_{t-1} + b_{h_{c}}) - h_{t} & = o_{t} \\tanh (c_{t}) + rnn = paddle.nn.GRU(16, 32, 2) - Parameters: - input_size (int): The input size for the first LSTM cell. - hidden_size (int): The hidden size for every LSTM cell. - num_layers(int, optional): The number of LSTM to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each LSTM cell except the last one. 0 for no dropout. 
Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. + x = paddle.randn((4, 23, 16)) + prev_h = paddle.randn((2, 4, 32)) + y, h = rnn(x, prev_h) - Examples: - .. code-block:: python - import paddle - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedLSTMCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] """ def __init__(self, input_size, hidden_size, num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedLSTMCell, self).__init__() - self.hidden_size = hidden_size - self.input_size = input_size - self.num_layers = num_layers - self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "lstm_%d" % i, - LSTMCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked LSTM cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ - is a list composed of every LSTM `new_states` which is a pair \ - of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ - and the data type and structure of these tensors all is same \ - as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - # TODO(guosheng): maybe should stack list of states as one tensor - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedLSTMCell is a list composed of each including - LSTM cell's `state_shape`. - Returns: - list: A list composed of each including LSTM cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] - - -class StackedGRUCell(RNNCellBase): - """ - Wrapper allowing a stack of GRU cells to behave as a single cell. It is used - to implement stacked GRU. - - The formula for GRU used here is as follows: - - .. 
math:: - - u_t & = \sigma(W_{x_{u}}x_{t} + b_{x_{u}} + W_{h_{u}}h_{t-1} + b_{h_{u}}) - - r_t & = \sigma(W_{x_{r}}x_{t} + b_{x_{r}} + W_{h_{r}}h_{t-1} + b_{h_{r}}) - - \\tilde{h_t} & = \\tanh(W_{x_{c}}x_{t} + r_t \odot (W_{h_{c}}h_{t-1} + b_{h_{c}}) - - h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} - - - Parameters: - input_size (int): The input size for the first GRU cell. - hidden_size (int): The hidden size for every GRU cell. - num_layers(int, optional): The number of GRU to be stacked. Default 1. - dropout(float, optional): The dropout probability applied on the outputs - of each GRU cell except the last one. 0 for no dropout. Default 0.0 - param_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. - Default None. - bias_attr (list|tuple|ParamAttr): A list, tuple or something can be - converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is - a list or tuple, it's length must equal to `num_layers`. Otherwise, - construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. - Default None. - dtype(string, optional): The data type used in this cell. It can be - float32 or float64. Default float32. - - Examples: - - .. code-block:: python - - import paddle + direction="forward", + dropout=0., + time_major=False, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super(GRU, self).__init__() - inputs = paddle.rand((2, 4, 32)) - cell = paddle.StackedGRUCell(input_size=32, hidden_size=64) - rnn = paddle.RNN(cell=cell) - outputs, _ = rnn(inputs) # [2, 4, 64] - """ + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(input_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + cell_bw = GRUCell(2 * hidden_size, hidden_size, weight_ih_attr, + weight_hh_attr, bias_ih_attr, bias_hh_attr) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) - def __init__(self, - input_size, - hidden_size, - num_layers=1, - dropout=0.0, - param_attr=None, - bias_attr=None, - dtype="float32"): - super(StackedGRUCell, self).__init__() - self.hidden_size = hidden_size self.input_size = input_size - self.num_layers = num_layers + self.hidden_size = hidden_size self.dropout = dropout - param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) - bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) - - self.cells = [] - for i in range(num_layers): - self.cells.append( - self.add_sublayer( - "gru_%d" % i, - 
GRUCell( - input_size=input_size if i == 0 else hidden_size, - hidden_size=hidden_size, - param_attr=param_attrs[i], - bias_attr=bias_attrs[i], - dtype=dtype))) - - def forward(self, inputs, states): - """ - Performs the stacked GRU cells sequentially. Each cell's `inputs` is - the `outputs` of the previous cell. And each cell's `states` is the - corresponding one in `states`. - - Parameters: - inputs (Variable): The inputs for the first cell. It is a float32 or - float64 tensor with shape `[batch_size, input_size]`. - states (list): A list containing states for all cells orderly. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ - a tensor with shape `[batch_size, hidden_size]`, corresponding \ - to :math:`h_{t}` in the formula of the last GRU; `new_states` \ - is a list composed of every GRU `new_states` which is also \ - :math:`h_{t}` in the formula, and the data type and structure \ - of these tensors all is same as that of `states`. - """ - new_states = [] - for i, cell in enumerate(self.cells): - outputs, new_state = cell(inputs, states[i]) - outputs = layers.dropout( - outputs, - self.dropout, - dropout_implementation='upscale_in_train' - ) if self.dropout and i != (self.num_layers - 1) else outputs - inputs = outputs - new_states.append(new_state) - return outputs, new_states - - @property - def state_shape(self): - """ - The `state_shape` of StackedGRUCell is a list composed of each including - GRU cell's `state_shape`. - - Returns: - list: A list composed of each including GRU cell's `state_shape`. - """ - return [cell.state_shape for cell in self.cells] + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 From 156b490b7ccf3dfbbd722a1dce06464a76e872be Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 24 Aug 2020 17:37:41 +0800 Subject: [PATCH 05/14] add unittets --- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/rnn/CMakeLists.txt | 6 + .../fluid/tests/unittests/rnn/__init__.py | 13 + .../fluid/tests/unittests/rnn/convert.py | 51 ++ .../fluid/tests/unittests/rnn/rnn_numpy.py | 515 ++++++++++++++++++ .../tests/unittests/rnn/test_rnn_cells.py | 164 ++++++ .../unittests/rnn/test_rnn_cells_static.py | 327 +++++++++++ .../tests/unittests/rnn/test_rnn_nets.py | 269 +++++++++ .../unittests/rnn/test_rnn_nets_static.py | 468 ++++++++++++++++ python/paddle/nn/layer/rnn.py | 4 +- 10 files changed, 1815 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/rnn/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/convert.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 33d9326681d09..e601c5a080172 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -476,6 +476,7 @@ endif() add_subdirectory(sequence) add_subdirectory(dygraph_to_static) +add_subdirectory(rnn) if (WITH_MKLDNN) 
add_subdirectory(mkldnn) diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt new file mode 100644 index 0000000000000..f71e04c09aa38 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/rnn/__init__.py b/python/paddle/fluid/tests/unittests/rnn/__init__.py new file mode 100644 index 0000000000000..abf198b97e6e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/rnn/convert.py b/python/paddle/fluid/tests/unittests/rnn/convert.py new file mode 100644 index 0000000000000..02f10694a4b47 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/convert.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
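+# The helpers below copy the randomly initialized parameters of the NumPy
+# reference cells/networks defined in rnn_numpy.py into the corresponding
+# paddle.nn layers (keyed by the paddle parameter names in dygraph mode, or
+# written through the global scope in static mode), so that both
+# implementations can be compared on identical weights in the tests.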
+ +import paddle +import numpy as np + + +def convert_params_for_cell(np_cell, paddle_cell): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + v.set_value(state[k]) + + +def convert_params_for_cell_static(np_cell, paddle_cell, place): + state = np_cell.parameters + for k, v in paddle_cell.named_parameters(): + scope = paddle.static.global_scope() + tensor = scope.find_var(v.name).get_tensor() + tensor.set(state[k], place) + + +def convert_params_for_net(np_net, paddle_net): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell(np_layer.cell, paddle_layer.cell) + else: + convert_params_for_cell(np_layer.cell_fw, paddle_layer.cell_fw) + convert_params_for_cell(np_layer.cell_bw, paddle_layer.cell_bw) + + +def convert_params_for_net_static(np_net, paddle_net, place): + for np_layer, paddle_layer in zip(np_net, paddle_net): + if hasattr(np_layer, "cell"): + convert_params_for_cell_static(np_layer.cell, paddle_layer.cell, + place) + else: + convert_params_for_cell_static(np_layer.cell_fw, + paddle_layer.cell_fw, place) + convert_params_for_cell_static(np_layer.cell_bw, + paddle_layer.cell_bw, place) diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py new file mode 100644 index 0000000000000..725d7df2df3a5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -0,0 +1,515 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import math + + +class LayerMixin(object): + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class LayerListMixin(LayerMixin): + def __init__(self, layers=None): + self._layers = list(layers) if layers else [] + + def append(self, layer): + self._layers.append(layer) + + def __iter__(self): + return iter(self._layers) + + +class SimpleRNNCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + if nonlinearity == 'tanh': + self.nonlinearity = np.tanh + else: + self.nonlinearity = lambda x: np.maximum(x, 0.) 
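+        # Weights follow the weight_ih/weight_hh naming used by the paddle
+        # cells and are drawn from U(-k, k) with k = 1 / sqrt(hidden_size);
+        # they are also collected in `self.parameters` so convert.py can copy
+        # them into the corresponding paddle cell for comparison.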
+ + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.bias_hh = np.random.uniform(-std, std, + (hidden_size, )).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_h = hx + i2h = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = np.matmul(pre_h, self.weight_hh.T) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self.nonlinearity(i2h + h2h) + return h, h + + +class GRUCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 3 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 3 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 3 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + return np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden = hx + x_gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = np.split(x_gates, 3, 1) + h_r, h_z, h_c = np.split(h_gates, 3, 1) + + r = 1.0 / (1.0 + np.exp(-(x_r + h_r))) + z = 1.0 / (1.0 + np.exp(-(x_z + h_z))) + c = np.tanh(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + return h, h + + +class LSTMCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.random.uniform(-std, std, ( + 4 * hidden_size, input_size)).astype('float64') + self.weight_hh = np.random.uniform(-std, std, ( + 4 * hidden_size, hidden_size)).astype('float64') + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.bias_hh = np.random.uniform(-std, std, ( + 4 * hidden_size)).astype('float64') + self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = 
self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + return init_h, init_c + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden, pre_cell = hx + gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = np.split(gates, 4, -1) + + i = 1.0 / (1.0 + np.exp(-chunked_gates[0])) + f = 1.0 / (1.0 + np.exp(-chunked_gates[1])) + o = 1.0 / (1.0 + np.exp(-chunked_gates[3])) + c = f * pre_cell + i * np.tanh(chunked_gates[2]) + h = o * np.tanh(c) + + return h, (h, c) + + +def sequence_mask(lengths, max_len=None): + if max_len is None: + max_len = np.max(lengths) + else: + assert max_len >= np.max(lengths) + return np.arange(max_len) < np.expand_dims(lengths, -1) + + +def update_state(mask, new, old): + if not isinstance(old, (tuple, list)): + return np.where(mask, new, old) + else: + return tuple(map(lambda x, y: np.where(mask, x, y), new, old)) + + +def rnn(cell, + inputs, + initial_states, + sequence_length=None, + time_major=False, + is_reverse=False): + if not time_major: + inputs = np.transpose(inputs, [1, 0, 2]) + if is_reverse: + inputs = np.flip(inputs, 0) + + if sequence_length is None: + mask = None + else: + mask = np.transpose(sequence_mask(sequence_length), [1, 0]) + mask = np.expand_dims(mask, -1) + if is_reverse: + mask = np.flip(mask, 0) + + time_steps = inputs.shape[0] + state = initial_states + outputs = [] + for t in range(time_steps): + x_t = inputs[t] + if mask is not None: + m_t = mask[t] + y, new_state = cell(x_t, state) + y = np.where(m_t, y, 0.) 
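+            # Padded steps (mask False) contribute a zero output, and
+            # update_state below keeps the previous state for them, so the
+            # state returned at the end is the one at each sequence's valid
+            # length.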
+ outputs.append(y) + state = update_state(m_t, new_state, state) + else: + y, new_state = cell(x_t, state) + outputs.append(y) + state = new_state + + outputs = np.stack(outputs) + final_state = state + + if is_reverse: + outputs = np.flip(outputs, 0) + if not time_major: + outputs = np.transpose(outputs, [1, 0, 2]) + return outputs, final_state + + +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False): + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = np.concatenate((outputs_fw, outputs_bw), -1) + final_states = (states_fw, states_bw) + return outputs, final_states + + +def flatten(nested): + return list(_flatten(nested)) + + +def _flatten(nested): + for item in nested: + if isinstance(item, (list, tuple)): + yield from _flatten(item) + else: + yield item + + +def unstack(array, axis=0): + num = array.shape[axis] + sub_arrays = np.split(array, num, axis) + return [np.squeeze(sub_array, axis) for sub_array in sub_arrays] + + +def dropout(array, p=0.5): + if p == 0.0: + return array + + mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) + return array * (mask / (1 - p)) + + +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return np.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [np.stack(item) for item in componnets] + + +class RNN(LayerMixin): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, inputs, initial_states=None, sequence_length=None): + final_outputs, final_states = rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states + + +class BiRNN(LayerMixin): + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) + return outputs, final_states + + +class RNNMixin(LayerListMixin): + def forward(self, inputs, initial_states=None, 
sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, batch_size, + self.hidden_size) + if self.state_components == 1: + initial_states = np.zeros(state_shape, dtype) + else: + initial_states = tuple([ + np.zeros(state_shape, dtype) + for _ in range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class SimpleRNN(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + nonlinearity="tanh", + direction="forward", + dropout=0., + time_major=False): + super(SimpleRNN, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + cell_bw = SimpleRNNCell(2 * hidden_size, hidden_size, + nonlinearity) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 + + +class LSTM(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 + + +class GRU(RNNMixin): + def 
__init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(GRU, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = GRUCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = GRUCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = GRUCell(input_size, hidden_size) + cell_bw = GRUCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = GRUCell(2 * hidden_size, hidden_size) + cell_bw = GRUCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 1 diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py new file mode 100644 index 0000000000000..78f4bbab3b354 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -0,0 +1,164 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
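The stacked networks defined above pack per-layer (and per-direction) states into a single
array of shape `[num_layers * num_directions, batch_size, hidden_size]` via
`split_states`/`concat_states`. A numpy-only sketch of that packing convention for two
bidirectional layers and a single state component (array sizes here are illustrative, not
taken from the patch):

    import numpy as np

    packed = np.random.randn(4, 3, 8)        # 2 layers * 2 directions, batch 3
    pieces = [np.squeeze(a, 0) for a in np.split(packed, 4, 0)]   # "unstack"
    per_layer = list(zip(pieces[::2], pieces[1::2]))              # (fw, bw) pairs
    assert len(per_layer) == 2 and per_layer[0][0].shape == (3, 8)
    assert np.allclose(np.stack(pieces), packed)                  # concat back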
+ +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell +from convert import convert_params_for_cell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRUCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + convert_params_for_cell(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(4, 16) + + y1, (h1, c1) = 
rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for bias in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py new file mode 100644 index 0000000000000..c371e4eff92e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -0,0 +1,327 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +torch.set_default_dtype(torch.float64) + +import paddle +paddle.framework.set_default_dtype("float64") + +import numpy as np +import unittest + +from convert import convert_params_for_cell_static +from rnn_numpy import SimpleRNNCell, LSTMCell, GRUCell + + +class TestSimpleRNNCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestSimpleRNNCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNNCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNNCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = 
np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestGRUCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestGRUCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRUCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRUCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTMCell(unittest.TestCase): + def __init__(self, bias=True, place="cpu"): + super(TestLSTMCell, self).__init__(methodName="runTest") + self.bias = bias + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTMCell(16, 32, bias=self.bias) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTMCell( + 16, 32, bias_ih_attr=self.bias, bias_hh_attr=self.bias) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with 
paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_cell_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + prev_h = np.random.randn(4, 32) + prev_c = np.random.randn(4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [-1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(4, 16) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, + feed=feed_dict, + fetch_list=[y, h, c], + use_prune=True) + + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for bias in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNNCell, TestLSTMCell, TestGRUCell]: + suite.addTest(test_class(bias, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py new file mode 100644 index 0000000000000..16c790000e862 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -0,0 +1,269 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
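The length-aware cases in the tests below zero out time steps beyond each sequence's valid
length before comparing against the numpy reference. The same masking in plain numpy,
assuming a batch-major `[batch, time, hidden]` layout (sizes are illustrative):

    import numpy as np

    y = np.random.randn(4, 12, 32)                    # [batch, time, hidden]
    lengths = np.array([12, 10, 9, 8])
    mask = np.arange(12)[None, :] < lengths[:, None]  # [batch, time] bool
    y_masked = y * mask[:, :, None]
    assert np.all(y_masked[3, 8:] == 0)               # steps past length 8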
+ +import torch + +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + y2, h2 = 
rnn2(paddle.to_variable(x), paddle.to_variable(prev_h)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + y2, h2 = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + paddle.disable_static(self.place) + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + rnn2 = paddle.nn.LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + convert_params_for_net(rnn1, rnn2) + + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + def test_with_initial_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + y2, (h2, c2) = rnn2( + paddle.to_variable(x), + (paddle.to_variable(prev_h), paddle.to_variable(prev_c))) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, (h1, c1) = rnn1(x) + y2, (h2, c2) = rnn2(paddle.to_variable(x)) + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + rnn1 = self.rnn1 + rnn2 = self.rnn2 + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + seq_len = paddle.to_variable(sequence_length) + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if 
self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y2, (h2, c2) = rnn2(paddle.to_variable(x), sequence_length=seq_len) + y2 = paddle.multiply(y2, mask, axis=0) + + np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py new file mode 100644 index 0000000000000..3620768262b5c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -0,0 +1,468 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +paddle.set_default_dtype("float64") +from paddle.fluid.layers import sequence_mask + +import numpy as np +import unittest + +from convert import convert_params_for_net_static +from rnn_numpy import SimpleRNN, LSTM, GRU + + +class TestSimpleRNN(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestSimpleRNN, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = SimpleRNN( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.SimpleRNN( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone().clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + 
dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + self.test_with_input_lengths() + + +class TestGRU(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestGRU, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.GRU(16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = 
self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + + y1, h1 = rnn1(x, prev_h) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data, init_h) + + feed_dict = {x_data.name: x, init_h.name: prev_h} + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, h1 = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, h = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, h1 = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, h = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + self.test_with_zero_state() + + +class TestLSTM(unittest.TestCase): + def __init__(self, time_major=True, direction="forward", place="cpu"): + super(TestLSTM, self).__init__("runTest") + self.time_major = time_major + self.direction = direction + self.num_directions = 2 if direction == "bidirectional" else 1 + self.place = paddle.CPUPlace() if place == "cpu" \ + else paddle.CUDAPlace(0) + + def setUp(self): + rnn1 = LSTM( + 16, 32, 2, time_major=self.time_major, direction=self.direction) + + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + rnn2 = paddle.nn.LSTM( + 16, + 32, + 2, + time_major=self.time_major, + direction=self.direction) + + place = self.place + exe = paddle.static.Executor(place) + 
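# Aside: the pattern these static-graph tests rely on, shown on a stand-in
# layer rather than the RNN layers themselves. The program pair is built
# under program_guard, the startup program runs once inside a private Scope,
# and that Scope is reused for every later run so converted parameters stay
# visible. This is only a sketch; `paddle.static.data` is assumed to be the
# current spelling of the `paddle.data` used elsewhere in these tests.
import numpy as np
import paddle
paddle.enable_static()

mp, sp = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(mp, sp):
    x = paddle.static.data("x", [-1, 16], dtype="float32")
    y = paddle.nn.Linear(16, 32)(x)

exe = paddle.static.Executor(paddle.CPUPlace())
scope = paddle.fluid.Scope()
with paddle.static.scope_guard(scope):
    exe.run(sp)                                # initialize parameters once
    out, = exe.run(mp, feed={"x": np.ones([4, 16], "float32")},
                   fetch_list=[y])
assert out.shape == (4, 32)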
scope = paddle.fluid.Scope() + with paddle.static.scope_guard(scope): + exe.run(sp) + convert_params_for_net_static(rnn1, rnn2, place) + + self.mp = mp + self.sp = sp + self.rnn1 = rnn1 + self.rnn2 = rnn2 + + self.place = place + self.executor = exe + self.scope = scope + + def test_with_initial_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + prev_h = np.random.randn(2 * self.num_directions, 4, 32) + prev_c = np.random.randn(2 * self.num_directions, 4, 32) + + y1, (h1, c1) = rnn1(x, (prev_h, prev_c)) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + init_h = paddle.data( + "init_h", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + init_c = paddle.data( + "init_c", [2 * self.num_directions, -1, 32], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data, (init_h, init_c)) + + feed_dict = {x_data.name: x, init_h.name: prev_h, init_c.name: prev_c} + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_zero_state(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + + y1, (h1, c1) = rnn1(x) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + y, (h, c) = rnn2(x_data) + + feed_dict = {x_data.name: x} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def test_with_input_lengths(self): + mp = self.mp.clone() + sp = self.sp + rnn1 = self.rnn1 + rnn2 = self.rnn2 + exe = self.executor + scope = self.scope + + x = np.random.randn(12, 4, 16) + if not self.time_major: + x = np.transpose(x, [1, 0, 2]) + sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) + + y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) + + with paddle.fluid.unique_name.guard(): + with paddle.static.program_guard(mp, sp): + x_data = paddle.data( + "input", [-1, -1, 16], + dtype=paddle.framework.get_default_dtype()) + seq_len = paddle.data("seq_len", [-1], dtype="int64") + mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) + if self.time_major: + mask = paddle.transpose(mask, [1, 0]) + y, (h, c) = rnn2(x_data, sequence_length=seq_len) + y = paddle.multiply(y, mask, axis=0) + + feed_dict = {x_data.name: x, seq_len.name: sequence_length} + + with paddle.static.scope_guard(scope): + y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) + + np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) + np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5) + + def runTest(self): + self.test_with_initial_state() + 
self.test_with_zero_state() + self.test_with_input_lengths() + + +def load_tests(loader, tests, pattern): + suite = unittest.TestSuite() + for direction in ["forward", "backward", "bidirectional"]: + for time_major in [True, False]: + for device in ["cpu", "gpu"]: + for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: + suite.addTest(test_class(time_major, direction, device)) + return suite diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 44710975edfd6..b15a4310dad8a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -837,9 +837,7 @@ def forward(self, inputs, initial_states=None, sequence_length=None): for i, rnn_layer in enumerate(self): if i > 0: inputs = F.dropout( - inputs, - self.dropout, - dropout_implementation="upscale_in_train") + inputs, self.dropout, mode="upscale_in_train") outputs, final_state = rnn_layer(inputs, states[i], sequence_length) final_states.append(final_state) inputs = outputs From 14574d4407cc836155cbe20c5857ac904eebdfb5 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Mon, 24 Aug 2020 20:04:54 +0800 Subject: [PATCH 06/14] disable gpu tests when paddle is not compiled with cuda support --- python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py | 4 +++- .../fluid/tests/unittests/rnn/test_rnn_cells_static.py | 6 ++++-- python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py | 4 +++- .../fluid/tests/unittests/rnn/test_rnn_nets_static.py | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py index 78f4bbab3b354..8d2677229a03f 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -157,8 +157,10 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for bias in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: suite.addTest(test_class(bias, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py index c371e4eff92e7..ede4c3ac189d4 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -320,8 +320,10 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for bias in [True, False]: - for device in ["cpu", "gpu"]: - for test_class in [TestSimpleRNNCell, TestLSTMCell, TestGRUCell]: + for device in devices: + for test_class in [TestSimpleRNNCell, TestGRUCell, TestLSTMCell]: suite.addTest(test_class(bias, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 16c790000e862..53b69efb2b3c8 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -261,9 +261,11 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for direction in ["forward", "backward", "bidirectional"]: for 
time_major in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: suite.addTest(test_class(time_major, direction, device)) return suite diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py index 3620768262b5c..90ed6b8b4c907 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets_static.py @@ -460,9 +460,11 @@ def runTest(self): def load_tests(loader, tests, pattern): suite = unittest.TestSuite() + devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ + else ["cpu"] for direction in ["forward", "backward", "bidirectional"]: for time_major in [True, False]: - for device in ["cpu", "gpu"]: + for device in devices: for test_class in [TestSimpleRNN, TestLSTM, TestGRU]: suite.addTest(test_class(time_major, direction, device)) return suite From 779e2263610b0ca27b0c5421499a5a7b16baa275 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 09:39:57 +0800 Subject: [PATCH 07/14] remove unnecessary imports --- .../paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py | 3 --- python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py index ede4c3ac189d4..948e47d5b9946 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells_static.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch -torch.set_default_dtype(torch.float64) - import paddle paddle.framework.set_default_dtype("float64") diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 53b69efb2b3c8..ef297b3bb6249 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch - import paddle paddle.set_default_dtype("float64") from paddle.fluid.layers import sequence_mask From 565ddb95445c9f00f15412d7631c67d3487b20d7 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 10:18:06 +0800 Subject: [PATCH 08/14] fix docstring --- python/paddle/nn/layer/rnn.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b15a4310dad8a..c24db317622d7 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -288,7 +288,9 @@ class SimpleRNNCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python + import paddle paddle.disable_static() @@ -413,6 +415,7 @@ class LSTMCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python import paddle @@ -546,6 +549,7 @@ class GRUCell(RNNCellBase): None). For more information, please refer to :ref:`api_guide_Name`. Examples: + .. code-block:: python import paddle @@ -688,6 +692,7 @@ class RNN(Layer): shape and dtype as the corresponding tensor in initial states. Examples: + .. 
code-block:: python import paddle @@ -767,6 +772,7 @@ class BiRNN(Layer): cell and backward cell. Examples: + .. code-block:: python import paddle @@ -911,6 +917,7 @@ class SimpleRNN(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle @@ -1047,6 +1054,7 @@ class LSTM(RNNMixin): the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle @@ -1175,6 +1183,7 @@ class GRU(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. Examples: + .. code-block:: python import paddle From 9569a55a5955ee6f10df63144001aee310e7efd6 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 10:43:38 +0800 Subject: [PATCH 09/14] add to no_sample wlist --- tools/wlist.json | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/wlist.json b/tools/wlist.json index 6a0360fbcd9d0..64949d7a28cc5 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -142,7 +142,20 @@ "Callback.on_eval_batch_end", "Callback.on_test_batch_begin", "Callback.on_test_batch_end", - "Model.prepare" + "Model.prepare", + "SimpleRNNCell", + "SimpleRNNCell.forward", + "LSTMCell", + "LSTMCell.forward", + "GRUCell", + "GRUCell.forward", + "SimpleRNN", + "GRU", + "LSTM", + "RNN", + "BiRNN", + "RNNCellBase", + "RNNCellBase.get_initial_states" ], "wlist_no_op_pass":[ "gelu", From 07bde98e2efceb17ddef28e1b2a715a1b5626cad Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Tue, 25 Aug 2020 14:00:04 +0800 Subject: [PATCH 10/14] backport to python2 to avoid yield from --- python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py index 725d7df2df3a5..7e0b8374b95cf 100644 --- a/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py +++ b/python/paddle/fluid/tests/unittests/rnn/rnn_numpy.py @@ -272,7 +272,8 @@ def flatten(nested): def _flatten(nested): for item in nested: if isinstance(item, (list, tuple)): - yield from _flatten(item) + for subitem in _flatten(item): + yield subitem else: yield item From ed3d925272faf18fd4636edd9fb9bc8e953cb822 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 26 Aug 2020 10:48:28 +0800 Subject: [PATCH 11/14] add **kwargs, fix typos --- python/paddle/fluid/layers/rnn.py | 90 +++--- python/paddle/nn/layer/rnn.py | 505 ++++++++++++++++++------------ 2 files changed, 344 insertions(+), 251 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index ae6539370f25f..d1b0e4961138c 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -38,6 +38,7 @@ 'Decoder', 'BeamSearchDecoder', 'rnn', + 'birnn', 'dynamic_decode', 'DecodeHelper', 'TrainingHelper', @@ -444,33 +445,29 @@ def rnn(cell, Arguments: cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): A (possibly nested structure of) tensor[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states(Tensor, optional): A (possibly nested structure of) - tensor[s], representing the initial state for RNN. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Default None. 
- sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - time_major(bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, sequence_length, ...]`. If - `True`, the data layout would be time major with shape - `[sequence_length, batch_size, ...]`. Default: `False`. - is_reverse(bool, optional): Indicate whether to calculate in the reverse - order of input sequences. Default: `False`. - **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + inputs(Tensor): the input sequences. + If time_major is True, the shape is + `[time_steps, batch_size, input_size]` + else the shape is `[batch_size, time_steps, input_size]`. + initial_states(Tensor|tuple|list, optional): the initial state of the + rnn cell. Tensor or a possibly nested structure of tensors. If not + provided, `cell.get_initial_states` would be called to produce + the initial state. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Defaults to False. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. Returns: (outputs, final_states) outputs (Tensor|list|tuple): the output sequence. Tensor or nested - structure of Tensor. + structure of Tensors. If `time_major` is True, the shape of each tensor in outpus is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. @@ -651,8 +648,8 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) -def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, - time_major): +def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, + **kwargs): """ birnn creates a bidirectional recurrent neural network specified by RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` @@ -668,28 +665,25 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, for `time_major == True`. It represents the inputs to be unrolled in RNN. initial_states(tuple, optional): A tuple of - If not provided, `cell.get_initial_states` would be used to produce - the each initial state. Defaults to None. - sequence_length(Tensor, optional): A tensor with shape `[batch_size]`. - It stores real length of each instance, thus enables users to extract - the last valid state when past a batch element's sequence length for - correctness. If not provided, the paddings would be treated same as - non-padding inputs. Default None. - time_major(bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, time_steps, ...]`. 
If - `True`, the data layout would be time major with shape - `[time_steps, batch_size, ...]`. Default: `False`. - **kwargs: Additional keyword arguments. Arguments passed to `cell.call`. + If not provided, `cell.get_initial_states` would be called to + produce initial state for each cell. Defaults to None. + sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 + or int32. The valid lengths of input sequences. Defaults to None. + If `sequence_length` is not None, the inputs are treated as + padded sequences. In each input sequence, elements whose time step + index are not less than the valid length are treated as paddings. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. + **kwargs: Additional keyword arguments to pass to `forward` of each cell. Returns: - outputs (Tensor): A (possibly nested structure of) tensor variable[s], - the outputs of the bidirectional RNN. It is the concatenation - of the outputs for both the forward RNN and backward RNN along - the last axis. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. + (outputs, final_states) + outputs (Tensor): the outputs of the bidirectional RNN. It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. + If time major is True, the shape is `[time_steps, batch_size, size]`, + else the shape is `[batch_size, time_steps, size]`, where size is + `cell_fw.hidden_size + cell_bw.hidden_size`. final_states (tuple): A tuple of the final states of the forward cell and backward cell. @@ -712,14 +706,16 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, inputs, states_fw, sequence_length, - time_major=time_major) + time_major=time_major, + **kwargs) outputs_bw, states_bw = rnn(cell_bw, inputs, states_bw, sequence_length, time_major=time_major, - is_reverse=True) + is_reverse=True, + **kwargs) outputs = map_structure(lambda x, y: tensor.concat([x, y], -1), outputs_fw, outputs_bw) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index c24db317622d7..2f5756459709a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -149,30 +149,35 @@ def get_initial_states(self, batch_ref, shape=None, dtype=None, - init_value=0, + init_value=0., batch_dim_idx=0): r""" Generate initialized states according to provided shape, data type and value. Arguments: - batch_ref: A (possibly nested structure of) tensor variable[s]. - The first dimension of the tensor will be used as batch size to - initialize states. - shape: A (possibly nested structure of) shape[s], where a shape is - represented as a list/tuple of integer). -1(for batch size) will - beautomatically inserted if shape is not started with it. If None, - property `state_shape` will be used. The default value is None. - dtype: A (possibly nested structure of) data type[s]. The structure - must be same as that of `shape`, except when all tensors' in states - has the same data type, a single data type can be used. If None and - property `cell.state_shape` is not available, float32 will be used - as the data type. The default value is None. - init_value: A float value used to initialize states. - batch_dim_idx: An integer indicating which dimension of the tensor in - inputs represents batch size. The default value is 0. 
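For concreteness, the `time_major` / `sequence_length` contract documented above can be
exercised with the numpy reference implementation this patch adds under the rnn unit-test
directory (`rnn_numpy`); the import below assumes running from that directory, and the
snippet illustrates the semantics only, not the fluid operators themselves:

    import numpy as np
    from rnn_numpy import LSTMCell, rnn, birnn

    cell = LSTMCell(16, 32)
    x = np.random.randn(4, 12, 16)          # batch-major: [batch, time, input]
    lengths = np.array([12, 10, 9, 8])
    y, (h, c) = rnn(cell, x, cell.init_state(x), sequence_length=lengths)
    assert y.shape == (4, 12, 32) and h.shape == (4, 32)

    # bidirectional: the two directions are concatenated on the last axis
    cell_fw, cell_bw = LSTMCell(16, 32), LSTMCell(16, 32)
    states = (cell_fw.init_state(x), cell_bw.init_state(x))
    y2, _ = birnn(cell_fw, cell_bw, x, states, sequence_length=lengths)
    assert y2.shape == (4, 12, 64)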
+ batch_ref (Tensor): A tensor, which shape would be used to + determine the batch size, which is used to generate initial + states. For `batch_ref`'s shape d, `d[batch_dim_idx]` is + treated as batch size. + shape (list|tuple, optional): A (possibly nested structure of) shape[s], + where a shape is a list/tuple of integer). `-1` (for batch size) + will be automatically prepended if a shape does not starts with + it. If None, property `state_shape` will be used. Defaults to + None. + dtype (str|list|tuple, optional): A (possibly nested structure of) + data type[s]. The structure must be same as that of `shape`, + except when all tensors' in states has the same data type, a + single data type can be used. If None and property `cell.state_shape` + is not available, current default floating type of paddle is + used. Defaults to None. + init_value (float, optional): A float value used to initialize states. + Defaults to 0. + batch_dim_idx (int, optional): An integer indicating which + dimension of the of `batch_ref` represents batch. Defaults to 0. Returns: - Variable: tensor variable[s] packed in the same structure provided \ - by shape, representing the initialized states. + init_states (Tensor|tuple|list): tensor of the provided shape and + dtype, or list of tensors that each satisfies the requirements, + packed in the same structure as `shape` and `type` does. """ # TODO: use inputs and batch_size batch_ref = flatten(batch_ref)[0] @@ -209,7 +214,7 @@ def __init__(self, shape): # nested structure of dtypes try: states_dtypes = self.state_dtype if dtype is None else dtype - except NotImplementedError: # use fp32 as default + except NotImplementedError: states_dtypes = framework.get_default_dtype() if len(flatten(states_dtypes)) == 1: dtype = flatten(states_dtypes)[0] @@ -229,8 +234,8 @@ def state_shape(self): r""" Abstract method (property). Used to initialize states. - A (possiblely nested structure of) shape[s], where a shape is represented - as a list/tuple of integers (-1 for batch size would be automatically + A (possiblely nested structure of) shape[s], where a shape is a + list/tuple of integers (-1 for batch size would be automatically inserted into a shape if shape is not started with it). Not necessary to be implemented if states are not initialized by `get_initial_states` or the `shape` argument is provided when using @@ -257,7 +262,8 @@ def state_dtype(self): class SimpleRNNCell(RNNCellBase): r""" - Elman RNN (SimpleRNN) cell. + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. The formula used is as follows: @@ -274,9 +280,9 @@ class SimpleRNNCell(RNNCellBase): Arguments: input_size (int): The input size. hidden_size (int): The hidden size. - nonlinearity (str): The activation in the SimpleRNN cell. It can be - `tanh` or `relu`. Defaults to `tanh`. - weight_ih_attr(ParamAttr, optional): The parameter attribute for + nonlinearity (str, optional): The activation in the SimpleRNN cell. + It can be `tanh` or `relu`. Defaults to `tanh`. + weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. weight_hh_attr(ParamAttr, optional): The parameter attribute for `weight_hh`. Default: None. @@ -287,6 +293,37 @@ class SimpleRNNCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
+ Parameters: + weight_ih (Parameter): shape (hidden_size, input_size), input to hidden + weight, corresponding to :math:`W_{ih}` in the formula. + weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to + hidden weight, corresponding to :math:`W_{hh}` in the formula. + bias_ih (Parameter): shape (hidden_size, ), input to hidden bias, + corresponding to :math:`b_{ih}` in the formula. + bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias, + corresponding to :math:`b_{hh}` in the formula. + + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (Tensor, optional): shape `[batch_size, hidden_size]`, the + previous hidden state, corresponding to :math:`h_{t-1}` in the + formula. When states is None, zero state is used. Defaults to + None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -344,24 +381,6 @@ def __init__(self, else F.relu def forward(self, inputs, states=None): - r""" - Given the input and previous atate, compute the output and update state. - - Arguments: - inputs (Tensor): shape `[batch_size, input_size]`, the input, - corresponding to :math:`x_t` in the formula. - states (Tensor, optional): shape `[batch_size, hidden_size]`, the - previous hidden state, corresponding to :math:`h_{t-1}` in the - formula. When states is None, zero state is used. Defaults to - None. - Returns: - (outputs, new_states) - outputs (Tensor): shape `[batch_size, hidden_size]`, the output, - corresponding to :math:`h_{t}` in the formula. - states (Tensor): shape `[batch_size, hidden_size]`, the new hidden - state, corresponding to :math:`h_{t}` in the formula. - - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_h = states @@ -381,7 +400,8 @@ def state_shape(self): class LSTMCell(RNNCellBase): r""" - Long-Short Term Memory(LSTM) RNN cell. + Long-Short Term Memory(LSTM) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. The formula used is as follows: @@ -414,6 +434,42 @@ class LSTMCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Parameters: + weight_ih (Parameter): shape (4 * hidden_size, input_size), input to + hidden weight, which corresponds to the concatenation of + :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula. + weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to + hidden weight, which corresponds to the concatenation of + :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula. + bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias, + which corresponds to the concatenation of + :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula. + bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, + which corresponds to the concatenation of + :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula. 
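To make the packed-gate layout above concrete, a quick shape check can help; this is a sketch that assumes the `LSTMCell(input_size, hidden_size)` constructor used elsewhere in this patch, with the gates concatenated in the order i, f, g, o stated above.

    import paddle
    paddle.disable_static()

    cell = paddle.nn.LSTMCell(16, 32)
    print(cell.weight_ih.shape)  # [128, 16]: 4 * hidden_size rows, input_size columns
    print(cell.weight_hh.shape)  # [128, 32]
    print(cell.bias_ih.shape)    # [128]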
+ + Inputs: + inputs (Tensor): shape `[batch_size, input_size]`, the input, + corresponding to :math:`x_t` in the formula. + states (tuple, optional): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the previous hidden state, + corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. + When states is None, zero state is used. Defaults to None. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. + states (tuple): a tuple of two tensors, each of shape + `[batch_size, hidden_size]`, the new hidden states, + corresponding to :math:`h_{t}, c{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -464,25 +520,6 @@ def __init__(self, self._activation = paddle.tanh def forward(self, inputs, states=None): - r""" - Given the input and previous atate, compute the output and update state. - - Arguments: - inputs (Tensor): shape `[batch_size, input_size]`, the input, - corresponding to :math:`x_t` in the formula. - states (tuple, optional): a tuple of two tensors, each of shape - `[batch_size, hidden_size]`, the previous hidden state, - corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. - When states is None, zero state is used. Defaults to None. - Returns: - (outputs, new_states) - outputs (Tensor): shape `[batch_size, hidden_size]`, the output, - corresponding to :math:`h_{t}` in the formula. - states (tuple): a tuple of two tensors, each of shape - `[batch_size, hidden_size]`, the new hidden states, - corresponding to :math:`h_{t}, c{t}` in the formula. - - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) pre_hidden, pre_cell = states @@ -516,7 +553,8 @@ def state_shape(self): class GRUCell(RNNCellBase): r""" - Gated Recurrent Unit (GRU) RNN cell. + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. The formula for GRU used is as follows: @@ -548,6 +586,39 @@ class GRUCell(RNNCellBase): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + Parameters: + weight_ih (Parameter): shape (3 * hidden_size, input_size), input to + hidden weight, which corresponds to the concatenation of + :math:`W_{ir}, W_{iz}, W_{ic}` in the formula. + weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to + hidden weight, which corresponds to the concatenation of + :math:`W_{hr}, W_{hz}, W_{hc}` in the formula. + bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias, + which corresponds to the concatenation of + :math:`b_{ir}, b_{iz}, b_{ic}` in the formula. + bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias, + which corresponds to the concatenation of + :math:`b_{hr}, b_{hz}, b_{hc}` in the formula. + + Inputs: + inputs (Tensor): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. + states (Tensor): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. + + Returns: + (outputs, new_states) + outputs (Tensor): shape `[batch_size, hidden_size]`, the output, + corresponding to :math:`h_{t}` in the formula. 
+ states (Tensor): shape `[batch_size, hidden_size]`, the new hidden + state, corresponding to :math:`h_{t}` in the formula. + + Notes: + All the weights and bias are initialized with `Uniform(-std, std)` by + default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more + information about parameter initialization, please refer to + :ref:`api_fluid_ParamAttr`. + Examples: .. code-block:: python @@ -598,23 +669,6 @@ def __init__(self, self._activation = paddle.tanh def forward(self, inputs, states=None): - r""" - Performs single step GRU calculations. - - Parameters: - inputs (Variable): A tensor with shape `[batch_size, input_size]`, - corresponding to :math:`x_t` in the formula. The data type - should be float32 or float64. - states (Variable): A tensor with shape `[batch_size, hidden_size]`. - corresponding to :math:`h_{t-1}` in the formula. The data type - should be float32 or float64. - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ - `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ - corresponding to :math:`h_t` in the formula. The data type of the \ - tensor is same as that of `states`. - """ if states is None: states = self.get_initial_states(inputs, self.state_shape) @@ -641,55 +695,57 @@ def state_shape(self): r""" The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch size would be automatically inserted into shape). The shape corresponds - to :math:`h_{t-1}`. + to the shape of :math:`h_{t-1}`. """ return (self.hidden_size, ) class RNN(Layer): r""" - Wrapper for RNN, which creates a recurrent neural network specified with a - RNN cell. It performs :code:`cell.forward()` repeatedly until reaches to - the maximum length of `inputs`. + Wrapper for RNN, which creates a recurrent neural network with an RNN cell. + It performs :code:`cell.forward()` repeatedly until it reaches the maximum + length of `inputs`. Arguments: - cell(RNNCellBase): An instance of `RNNCell`. + cell(RNNCellBase): An instance of `RNNCellBase`. is_reverse (bool, optional): Indicate whether to calculate in the reverse order of input sequences. Defaults to False. - time_major (bool, optional): Indicate the data layout of Tensor included - in `input` and `output` tensors. If `False`, the data layout would - be batch major with shape `[batch_size, time_steps, ...]`. If - `True`, the data layout would be time major with shape - `[time_steps, batch_size, ...]`. Defaults to False. + time_major (bool): Whether the first dimension of the input means the + time steps. Defaults to False. Inputs: - inputs (Tensor): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states (Tensor|list|tuple, optional): A (possibly nested structure of) - tensor[s], representing the initial state for the rnn cell. - If not provided, `cell.get_initial_states` would be used to produce - the initial state. Defaults to None. + inputs (Tensor): A (possibly nested structure of) tensor[s]. The input + sequences. + If time major is True, the shape is `[time_steps, batch_size, input_size]` + If time major is False, the shape is `[batch_size, time_steps, input_size]` + where `input_size` is the input size of the cell.
+ initial_states (Tensor|list|tuple, optional): Tensor of a possibly + nested structure of tensors, representing the initial state for + the rnn cell. If not provided, `cell.get_initial_states` would be + called to produce the initial states. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + **kwargs: Additional keyword arguments to pass to `forward` of the cell. - Outputs: + Returns: (outputs, final_states) - outputs (Tensor|list|tuple): the output sequence. Tensor or nested - structure of Tensor. - If `time_major` is True, the shape of each tensor in outpus is + outputs (Tensor|list|tuple): the output sequences. + If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`. - final_states (Tensor|list|tuple): final states. A (possibly nested structure of) - tensor[s], representing the final state for RNN. It has the same - structure of intial state. Each tensor in final states has the same - shape and dtype as the corresponding tensor in initial states. + final_states (Tensor|list|tuple): final states of the cell. Tensor or + a possibly nested structure of tensors which has the same structure + with intial state. Each tensor in final states has the same shape + and dtype as the corresponding tensor in initial states. + + Notes: + This class is a low level API for wrapping rnn cell into a RNN network. + Users should take care of the state of the cell. If `initial_states` is + passed to the `forward` method, make sure that it satisfies the + requirements of the cell. Examples: @@ -716,7 +772,11 @@ def __init__(self, cell, is_reverse=False, time_major=False): self.is_reverse = is_reverse self.time_major = time_major - def forward(self, inputs, initial_states=None, sequence_length=None): + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): if initial_states is None: initial_states = self.cell.get_initial_states( batch_ref=inputs, @@ -728,48 +788,58 @@ def forward(self, inputs, initial_states=None, sequence_length=None): initial_states=initial_states, sequence_length=sequence_length, time_major=self.time_major, - is_reverse=self.is_reverse) + is_reverse=self.is_reverse, + **kwargs) return final_outputs, final_states class BiRNN(Layer): r""" - Wrapper for bidirectional RNN. It assembles two RNN cells by performing - forward and backward RNN separately, and concat outputs. + Wrapper for bidirectional RNN, which builds a bidiretional RNN given the + forward rnn cell and backward rnn cell. A BiRNN applies forward RNN and + backward RNN with coresponding cells separately and concats the outputs + along the last axis. - Parameters: - cell_fw (RNNCellBase): A RNNCell instance used for forward RNN. - cell_bw (RNNCellBase): A RNNCell instance used for backward RNN. + Arguments: + cell_fw (RNNCellBase): A RNNCellBase instance used for forward RNN. + cell_bw (RNNCellBase): A RNNCellBase instance used for backward RNN. time_major (bool): Whether the first dimension of the input means the - time steps. + time steps. 
Defaults to False. Inputs: - inputs (Tensor): A (possibly nested structure of) tensor variable[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in both forward and backward RNN. - initial_states (list|tuple, optional): A tuple of the initial states of - the forward cell and backward cell. - If not provided, `cell.get_initial_states` would be used to produce - the initial states. Defaults to None. + inputs (Tensor): the input sequences of both RNN. + If time_major is True, the shape of is + `[time_steps, batch_size, input_size]`, else the shape is + `[batch_size, time_steps, input_size]`, where input_size is the + input size of both cells. + initial_states (list|tuple, optional): A tuple/list of the initial + states of the forward cell and backward cell. Defaults to None. + If not provided, `cell.get_initial_states` would be called to + produce the initial states for each cell. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. + **kwargs: Additional keyword arguments. Arguments passed to `forward` + for each cell. Outputs: - outputs (Tensor): A (possibly nested structure of) tensor variable[s], - the outputs of the bidirectional RNN. It is the concatenation - of the outputs for both the forward RNN and backward RNN along - the last axis. - The shape of tensor should be `[batch_size, time_steps, ...]` - for `time_major == False` or `[time_steps, batch_size, ...]` - for `time_major == True`. - final_states (tuple): A tuple of the final states of the forward - cell and backward cell. + (outputs, final_states) + outputs (Tensor): the outputs of the bidirectional RNN. It is the + concatenation of the outputs from the forward RNN and backward + RNN along the last axis. + If time major is True, the shape is `[time_steps, batch_size, size]`, + else the shape is `[batch_size, time_steps, size]`, where size is + `cell_fw.hidden_size + cell_bw.hidden_size`. + final_states (tuple): A tuple of the final states of the forward + cell and backward cell. + + Notes: + This class is a low level API for wrapping rnn cells into a BiRNN + network. Users should take care of the states of the cells. + If `initial_states` is passed to the `forward` method, make sure that + it satisfies the requirements of the cells. 
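As a concrete illustration of the note above, passing per-direction initial states that match each cell's state structure could look like this; a sketch using two LSTMCells (whose states are `(h, c)` pairs), mirroring the `birnn` example added later in this series.

    import paddle
    paddle.disable_static()

    cell_fw = paddle.nn.LSTMCell(16, 32)
    cell_bw = paddle.nn.LSTMCell(16, 32)
    rnn = paddle.nn.BiRNN(cell_fw, cell_bw)

    inputs = paddle.rand((4, 23, 16))
    hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32))
    hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32))
    # one (h, c) pair per direction, in (forward, backward) order
    outputs, final_states = rnn(inputs, ((hf, cf), (hb, cb)))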
Examples: @@ -791,6 +861,10 @@ def __init__(self, cell_fw, cell_bw, time_major=False): super(BiRNN, self).__init__() self.cell_fw = cell_fw self.cell_bw = cell_bw + if cell_fw.input_size != cell_bw.input_size: + raise ValueError("input size of forward cell({}) does not equals" + "that of backward cell({})".format( + cell_fw.input_size, cell_bw.input_size)) for cell in [self.cell_fw, self.cell_bw]: if not hasattr(cell, "call"): # for non-dygraph mode, `rnn` api uses cell.call @@ -810,13 +884,13 @@ def forward(self, outputs, final_states = F.birnn(self.cell_fw, self.cell_bw, inputs, initial_states, sequence_length, - self.time_major) + self.time_major, **kwargs) return outputs, final_states class RNNMixin(LayerList): r""" - A Mixin class for RNN networks. It provides forward method for SimpleRNN, + A Mixin class for RNN networks. It provides `forward` method for SimpleRNN, LSTM and GRU. """ @@ -843,7 +917,10 @@ def forward(self, inputs, initial_states=None, sequence_length=None): for i, rnn_layer in enumerate(self): if i > 0: inputs = F.dropout( - inputs, self.dropout, mode="upscale_in_train") + inputs, + self.dropout, + training=self.training, + mode="upscale_in_train") outputs, final_state = rnn_layer(inputs, states[i], sequence_length) final_states.append(final_state) inputs = outputs @@ -855,14 +932,14 @@ def forward(self, inputs, initial_states=None, sequence_length=None): class SimpleRNN(RNNMixin): r""" - Multilayer Elman network(SimpleRNN). It takes a sequence and an initial - state as inputs, and returns the output sequence and the final state. + Multilayer Elman network(SimpleRNN). It takes input sequences and initial + states as inputs, and returns the output sequences and the final states. - Each layer inside the SimpleRNN maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) - and new state(:math:`h_{t}`). + Each layer inside the SimpleRNN maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). .. math:: @@ -875,23 +952,23 @@ class SimpleRNN(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - nonlinearity (str): The activation in each SimpleRNN cell. It can be + num_layers (int, optional): Number of layers. Defaults to 1. + nonlinearity (str, optional): The activation in each SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. - direction (str): The direction of the network. It can be "forward", + direction (str, optional): The direction of the network. It can be "forward", "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. - time_major (bool): Whether the first dimension of the input means the - time steps. + dropout (float, optional): The droput probability. Dropout is applied to the + input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input means the + time steps. Defaults to False. 
weight_ih_attr (ParamAttr, optional): The parameter attribute for - `weight_ih` of each cell. Default: None. + `weight_ih` of each cell. Defaults to None. weight_hh_attr (ParamAttr, optional): The parameter attribute for - `weight_hh` of each cell. Default: None. + `weight_hh` of each cell. Defaults to None. bias_ih_attr (ParamAttr, optional): The parameter attribute for the - `bias_ih` of each cells. Default: None. + `bias_ih` of each cells. Defaults to None. bias_ih_attr (ParamAttr, optional): The parameter attribute for the - `bias_hh` of each cells. Default: None. + `bias_hh` of each cells. Defaults to None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -903,18 +980,24 @@ class SimpleRNN(RNNMixin): `[num_lauers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as - padded sequences. In each input sequence, elements whos time step + padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. final_states (Tensor): final states. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: @@ -990,13 +1073,13 @@ def __init__(self, class LSTM(RNNMixin): r""" Multilayer LSTM. It takes a sequence and an initial state as inputs, and - returns the output sequence and the final state. + returns the output sequences and the final states. - Each layer inside the LSTM maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step - output(:math:`y_{t}`) and new state(:math:`h_{t}, c_{t}`). + Each layer inside the LSTM maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}, c_{t-1}`) as inputs, and returns step + outputs(:math:`y_{t}`) and new states(:math:`h_{t}, c_{t}`). .. math:: @@ -1014,13 +1097,13 @@ class LSTM(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - direction (str): The direction of the network. It can be "forward", - "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. - time_major (bool): Whether the first dimension of the input means the - time steps. 
+ num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input + means the time steps. Defaults to False. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1040,18 +1123,25 @@ class LSTM(RNNMixin): the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. - final_states (Tensor): the final state, a tuple of (h, c), - the shape of each is `[num_lauers * num_directions, batch_size, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + If `time_major` is False, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. + final_states (Tensor): the final state, a tuple of two tensors, h and c. + The shape of each is + `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: @@ -1120,14 +1210,14 @@ def __init__(self, class GRU(RNNMixin): r""" - Multilayer GRU. It takes a sequence and an initial state as inputs, and - returns the output sequence and the final state. + Multilayer GRU. It takes input sequencse and initial states as inputs, and + returns the output sequences and the final states. - Each layer inside the GRU maps the input sequence and initial state - to the output sequence and final state in the following manner: at each - step, it takes step input(:math:`x_{t}`) and previous - state(:math:`h_{t-1}`) as inputs, and returns step output(:math:`y_{t}`) - and new state(:math:`h_{t}`). + Each layer inside the GRU maps the input sequences and initial states + to the output sequences and final states in the following manner: at each + step, it takes step inputs(:math:`x_{t}`) and previous + states(:math:`h_{t-1}`) as inputs, and returns step outputs(:math:`y_{t}`) + and new states(:math:`h_{t}`). .. math:: @@ -1143,13 +1233,13 @@ class GRU(RNNMixin): Arguments: input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. - num_layers (int): Number of layers. Defaults to 1. - direction (str): The direction of the network. It can be "forward", - "backward" and "bidirectional". Defaults to "forward". - dropout (float): The droput probability. Dropout is applied to the - input of each layer except for the first layer. 
- time_major (bool): Whether the first dimension of the input means the - time steps. + num_layers (int, optional): Number of layers. Defaults to 1. + direction (str, optional): The direction of the network. It can be + "forward", "backward" and "bidirectional". Defaults to "forward". + dropout (float, optional): The droput probability. Dropout is applied + to the input of each layer except for the first layer. Defaults to 0. + time_major (bool, optional): Whether the first dimension of the input + means the time steps. Defaults to False. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih` of each cell. Default: None. weight_hh_attr (ParamAttr, optional): The parameter attribute for @@ -1167,20 +1257,27 @@ class GRU(RNNMixin): else, the shape is `[batch_size, time_steps, hidden_size]`. initial_states (Tensor, optional): the initial state. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. - If initial_state is not given, zero initial states are used. + If initial_state is not given, zero initial states are used. + Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 - or int32. The valid lengths of input sequences. + or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. - Outputs: + Returns: (outputs, final_states) outputs (Tensor): the output sequence. - If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, - else, the shape is `[batch_size, time_steps, hidden_size]`. + If `time_major` is True, the shape is + `[time_steps, batch_size, num_directions * hidden_size]`, + else, the shape is + `[batch_size, time_steps, num_directions * hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. final_states (Tensor): final states. The shape is `[num_lauers * num_directions, batch_size, hidden_size]`. + Note that `num_directions` is 2 if direction is "bidirectional" + else 1. Examples: From d843db8d783ba1bf77c06ea708229b35ba24aa86 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 26 Aug 2020 16:40:22 +0800 Subject: [PATCH 12/14] update docstrings for birnn --- python/paddle/fluid/layers/rnn.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index d1b0e4961138c..632569fa4fbe3 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -658,13 +658,14 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, along the last axis. Arguments: - cell(RNNCellBase): An instance of `RNNCellBase`. - inputs(Tensor): A (possibly nested structure of) tensor[s]. - The shape of tensor should be `[batch_size, sequence_length, ...]` - for `time_major == False` or `[sequence_length, batch_size, ...]` - for `time_major == True`. It represents the inputs to be unrolled - in RNN. - initial_states(tuple, optional): A tuple of + cell_fw(RNNCellBase): An instance of `RNNCellBase`. + cell_bw(RNNCellBase): An instance of `RNNCellBase`. + inputs(Tensor): the input sequences. + If time_major is True, the shape is + `[time_steps, batch_size, input_size]` + else the shape is `[batch_size, time_steps, input_size]`. + initial_states(tuple, optional): A tuple of initial states of + `cell_fw` and `cell_bw`. 
If not provided, `cell.get_initial_states` would be called to produce initial state for each cell. Defaults to None. sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64 From d0f9fba010edeffec711746b8cfaaf95fb58fcd3 Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 27 Aug 2020 09:45:55 +0800 Subject: [PATCH 13/14] rename argument for SimpleRNN and SimpleRNNCell, fix sample code --- python/paddle/fluid/layers/rnn.py | 28 ++++++++++++++----- python/paddle/nn/layer/rnn.py | 45 ++++++++++++++++--------------- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 632569fa4fbe3..85de86a42c0ad 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -648,7 +648,12 @@ def _switch_grad(x, stop=False): return (final_outputs, final_states) -def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False, **kwargs): """ birnn creates a bidirectional recurrent neural network specified by @@ -686,8 +691,7 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`. final_states (tuple): A tuple of the final states of the forward - cell and backward cell. - + cell and backward cell. Examples: @@ -696,12 +700,22 @@ def birnn(cell_fw, cell_bw, inputs, initial_states, sequence_length, time_major, import paddle paddle.disable_static() - cell_fw = LSTMCell(16, 32) - cell_bw = LSTMCell(16, 32) - inputs = paddle.rand((2, 23, 16)) - outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs) + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + + inputs = paddle.rand((4, 23, 16)) + hf, cf = paddle.rand((4, 32)), paddle.rand((4, 32)) + hb, cb = paddle.rand((4, 32)), paddle.rand((4, 32)) + initial_states = ((hf, cf), (hb, cb)) + outputs, final_states = paddle.nn.functional.birnn( + cell_fw, cell_bw, inputs, initial_states) """ + if initial_states is None: + state_fw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) + state_bw = cell_fw.get_initial_states( + batch_ref=inputs, batch_dim_idx=1 if time_major else 0) states_fw, states_bw = initial_states outputs_fw, states_fw = rnn(cell_fw, inputs, diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 2f5756459709a..6f1c5f199ac99 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -280,7 +280,7 @@ class SimpleRNNCell(RNNCellBase): Arguments: input_size (int): The input size. hidden_size (int): The hidden size. - nonlinearity (str, optional): The activation in the SimpleRNN cell. + activation (str, optional): The activation in the SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. weight_ih_attr (ParamAttr, optional): The parameter attribute for `weight_ih`. Default: None. 
@@ -342,7 +342,7 @@ class SimpleRNNCell(RNNCellBase): def __init__(self, input_size, hidden_size, - nonlinearity="tanh", + activation="tanh", weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, @@ -371,13 +371,13 @@ def __init__(self, self.input_size = input_size self.hidden_size = hidden_size - if nonlinearity not in ["tanh", "relu"]: + if activation not in ["tanh", "relu"]: raise ValueError( - "nonlinearity for SimpleRNNCell should be tanh or relu, " - "but get {}".format(nonlinearity)) - self.nonlinearity = nonlinearity - self._nonlinear_fn = paddle.tanh \ - if nonlinearity == "tanh" \ + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ else F.relu def forward(self, inputs, states=None): @@ -390,7 +390,7 @@ def forward(self, inputs, states=None): h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) if self.bias_hh is not None: h2h += self.bias_hh - h = self._nonlinear_fn(i2h + h2h) + h = self._activation_fn(i2h + h2h) return h, h @property @@ -479,9 +479,10 @@ class LSTMCell(RNNCellBase): x = paddle.randn((4, 16)) prev_h = paddle.randn((4, 32)) + prev_c = paddle.randn((4, 32)) cell = paddle.nn.LSTMCell(16, 32) - y, h = cell(x, prev_h) + y, (h, c) = cell(x, (prev_h, prev_c)) """ @@ -758,7 +759,7 @@ class RNN(Layer): prev_h = paddle.randn((4, 32)) cell = paddle.nn.SimpleRNNCell(16, 32) - rnn = paddle.RNN(cell) + rnn = paddle.nn.RNN(cell) outputs, final_states = rnn(inputs, prev_h) """ @@ -848,9 +849,9 @@ class BiRNN(Layer): import paddle paddle.disable_static() - cell_fw = LSTMCell(16, 32) - cell_bw = LSTMCell(16, 32) - rnn = BidirectionalRNN(cell_fw, cell_bw) + cell_fw = paddle.nn.LSTMCell(16, 32) + cell_bw = paddle.nn.LSTMCell(16, 32) + rnn = paddle.nn.BiRNN(cell_fw, cell_bw) inputs = paddle.rand((2, 23, 16)) outputs, final_states = rnn(inputs) @@ -953,7 +954,7 @@ class SimpleRNN(RNNMixin): input_size (int): The input size for the first layer's cell. hidden_size (int): The hidden size for each layer's cell. num_layers (int, optional): Number of layers. Defaults to 1. - nonlinearity (str, optional): The activation in each SimpleRNN cell. It can be + activation (str, optional): The activation in each SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`. direction (str, optional): The direction of the network. It can be "forward", "backward" and "bidirectional". Defaults to "forward". 
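A minimal sketch of the renamed `activation` argument in use, with hypothetical sizes and following the style of the examples fixed in this patch:

    import paddle
    paddle.disable_static()

    x = paddle.randn((4, 16))
    prev_h = paddle.randn((4, 32))

    cell = paddle.nn.SimpleRNNCell(16, 32, activation="relu")
    y, h = cell(x, prev_h)  # both are the new hidden state, shape [4, 32]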
@@ -1018,7 +1019,7 @@ def __init__(self, input_size, hidden_size, num_layers=1, - nonlinearity="tanh", + activation="tanh", direction="forward", dropout=0., time_major=False, @@ -1031,29 +1032,29 @@ def __init__(self, if direction in ["forward", "backward"]: is_reverse = direction == "backward" - cell = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) for i in range(1, num_layers): - cell = SimpleRNNCell(hidden_size, hidden_size, nonlinearity, + cell = SimpleRNNCell(hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(RNN(cell, is_reverse, time_major)) elif direction == "bidirectional": - cell_fw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell_fw = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) - cell_bw = SimpleRNNCell(input_size, hidden_size, nonlinearity, + cell_bw = SimpleRNNCell(input_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) for i in range(1, num_layers): cell_fw = SimpleRNNCell( - 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + 2 * hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) cell_bw = SimpleRNNCell( - 2 * hidden_size, hidden_size, nonlinearity, weight_ih_attr, + 2 * hidden_size, hidden_size, activation, weight_ih_attr, weight_hh_attr, bias_ih_attr, bias_hh_attr) self.append(BiRNN(cell_fw, cell_bw, time_major)) else: From db45fa3aa30376d24c64de0406aec64e60d7859c Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Thu, 27 Aug 2020 15:34:36 +0800 Subject: [PATCH 14/14] add default value for initial_states in fluid.layers.birnn --- python/paddle/fluid/layers/rnn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 85de86a42c0ad..fe8ed83923e88 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -651,7 +651,7 @@ def _switch_grad(x, stop=False): def birnn(cell_fw, cell_bw, inputs, - initial_states, + initial_states=None, sequence_length=None, time_major=False, **kwargs): @@ -712,11 +712,12 @@ def birnn(cell_fw, """ if initial_states is None: - state_fw = cell_fw.get_initial_states( + states_fw = cell_fw.get_initial_states( batch_ref=inputs, batch_dim_idx=1 if time_major else 0) - state_bw = cell_fw.get_initial_states( + states_bw = cell_bw.get_initial_states( batch_ref=inputs, batch_dim_idx=1 if time_major else 0) - states_fw, states_bw = initial_states + else: + states_fw, states_bw = initial_states outputs_fw, states_fw = rnn(cell_fw, inputs, states_fw,
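With the default added in this last patch, `birnn` can be called without explicit initial states; a sketch mirroring the earlier example in this series but relying on the zero states derived from `inputs`:

    import paddle
    paddle.disable_static()

    cell_fw = paddle.nn.LSTMCell(16, 32)
    cell_bw = paddle.nn.LSTMCell(16, 32)
    inputs = paddle.rand((4, 23, 16))

    # initial_states defaults to None; zero states are built per cell from `inputs`
    outputs, final_states = paddle.nn.functional.birnn(cell_fw, cell_bw, inputs)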