From 33d14b1afb8dc3ae41f62842f51d52adecef3432 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 1 Mar 2018 11:49:35 +0800 Subject: [PATCH 1/9] Add simple wrapper for beam search. Provide a simple seq2seq model. --- fluid/rnn_beam_search/beam_search.py | 205 +++++++++++++++++++++++ fluid/rnn_beam_search/simple_seq2seq.py | 211 ++++++++++++++++++++++++ 2 files changed, 416 insertions(+) create mode 100644 fluid/rnn_beam_search/beam_search.py create mode 100644 fluid/rnn_beam_search/simple_seq2seq.py diff --git a/fluid/rnn_beam_search/beam_search.py b/fluid/rnn_beam_search/beam_search.py new file mode 100644 index 0000000000..503c86e901 --- /dev/null +++ b/fluid/rnn_beam_search/beam_search.py @@ -0,0 +1,205 @@ +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.layer_helper import LayerHelper, unique_name +import paddle.v2.fluid.core as core + + +class DecoderType: + TRAINING = 1 + BEAM_SEARCH = 2 + + +class BasicRNNCell(object): + def __init__(self, cell_size): + self._size = cell_size + + def inject_decoder(self, decoder_obj, **kwargs): + self._decoder_obj = decoder_obj + + if isinstance(decoder_obj, TrainingDecoder): + self._decoder_type = DecoderType.TRAINING + assert 'step_inputs' in kwargs + self._step_inputs = kwargs['step_inputs'] + assert 'init_states' in kwargs + self._init_states = kwargs['init_states'] + elif isinstance(decoder_obj, BeamSearchDecoder): + self._decoder_type = DecoderType.BEAM_SEARCH + assert 'init_states' in kwargs + self._init_states = kwargs['init_states'] + + def init_inputs(self): + if self._decoder_type == DecoderType.TRAINING: + assert len(self._step_inputs) == 1 + self._current_word = self._decoder_obj.rnn.step_input( + self._step_inputs[0]) + + def init_states(self): + if self._decoder_type == DecoderType.TRAINING: + assert len(self._init_states) == 1 + self._hidden_mem = self._decoder_obj.rnn.memory( + init=self._init_states[0]) + elif self._decoder_type == DecoderType.BEAM_SEARCH: + parent_block = self._decoder_obj.parent_block() + self._hidden_mem = parent_block.create_var( + name=unique_name('beam_search_basic_rnn_state'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=self._init_states[0].dtype) + parent_block.append_op( + type='write_to_array', + inputs={ + 'X': self._init_states[0], + 'I': self._decoder_obj.counter + }, + outputs={'Out': self._hidden_mem}) + + def compute_current_state(self, **kwargs): + if self._decoder_type == DecoderType.TRAINING: + self._current_state = layers.fc( + input=[self._current_word, self._hidden_mem], size=self._size) + elif self._decoder_type == DecoderType.BEAM_SEARCH: + prev_state = layers.array_read( + array=self._hidden_mem, i=self._decoder_obj.counter) + assert 'prev_scores' in kwargs + prev_state_expanded = layers.sequence_expand(prev_state, + kwargs['prev_scores']) + assert 'prev_ids_embedding' in kwargs + self._current_state = layers.fc( + input=[kwargs['prev_ids_embedding'], prev_state_expanded], + size=self._size, + act='tanh') + + def update_states(self): + if self._decoder_type == DecoderType.TRAINING: + self._decoder_obj.rnn.update_memory(self._hidden_mem, + self._current_state) + elif self._decoder_type == DecoderType.BEAM_SEARCH: + layers.array_write( + self._current_state, + array=self._hidden_mem, + i=self._decoder_obj.counter) + + def update_outputs(self): + # may provide a output call back + if self._decoder_type == DecoderType.TRAINING: + self.calc_scores() + self._decoder_obj.rnn.output(self._current_scores) + + def calc_scores(self): + if self._decoder_type == DecoderType.TRAINING: + self._current_scores = layers.fc(input=self._current_state, + size=self._decoder_obj.label_dim, + act='softmax') + return self._current_scores + elif self._decoder_type == DecoderType.BEAM_SEARCH: + self._current_scores = layers.fc(input=self._current_state, + size=self._decoder_obj.label_dim, + act='softmax') + return self._current_scores + + +class TrainingDecoder(object): + def __init__(self, + cell_obj, + step_inputs, + label_dim, + static_inputs=None, + init_states=None): + self.label_dim = label_dim + self._helper = LayerHelper('training_decoder', name=name) + + if not isinstance(step_inputs, list): + step_inputs = [step_inputs] + + if static_inputs is not None and not isinstance(static_inputs, list): + static_inputs = [static_inputs] + + if init_states is not None and not isinstance(init_states, list): + init_states = [init_states] + + self.rnn = layers.DynamicRNN() + cell_obj.inject_decoder( + self, step_inputs=step_inputs, init_states=init_states) + + with self.rnn.block(): + cell_obj.init_inputs() + cell_obj.init_states() + cell_obj.compute_current_state() + cell_obj.update_states() + cell_obj.update_outputs() + + def __call__(self): + return self.rnn() + + +class BeamSearchDecoder(object): + def __init__(self, + cell_obj, + init_ids, + init_scores, + init_states, + max_length, + label_dim, + eos_token, + beam_width, + embedding_layer, + name=None): + self._helper = LayerHelper('beam_search_decoder', name=name) + self.label_dim = label_dim + + if not isinstance(init_states, list): + init_states = [init_states] + + array_len = layers.fill_constant( + shape=[1], dtype='int64', value=max_length) + self.counter = layers.zeros(shape=[1], dtype='int64') + ids_array = layers.create_array('int64') + layers.array_write(init_ids, array=ids_array, i=self.counter) + scores_array = layers.create_array('float32') + layers.array_write(init_scores, array=scores_array, i=self.counter) + cond = layers.less_than(x=self.counter, y=array_len) + + cell_obj.inject_decoder(self, init_states=init_states) + + while_op = layers.While(cond=cond) + with while_op.block(): + cell_obj.init_states() + prev_ids = layers.array_read(array=ids_array, i=self.counter) + prev_scores = layers.array_read(array=scores_array, i=self.counter) + prev_ids_embedding = embedding_layer(prev_ids) + + cell_obj.compute_current_state( + prev_scores=prev_scores, prev_ids_embedding=prev_ids_embedding) + + current_scores = cell_obj.calc_scores() + + topk_scores, topk_indices = layers.topk( + current_scores, k=beam_width) + selected_ids, selected_scores = layers.beam_search( + prev_ids, + topk_indices, + topk_scores, + beam_width, + end_id=eos_token, + level=0) + + layers.increment(x=self.counter, value=1, in_place=True) + + cell_obj.update_states() + layers.array_write(selected_ids, array=ids_array, i=self.counter) + layers.array_write( + selected_scores, array=scores_array, i=self.counter) + + layers.less_than(x=self.counter, y=array_len, cond=cond) + + self._translation_ids, self._translation_scores = layers.beam_search_decode( + ids=ids_array, scores=scores_array) + + def parent_block(self): + program = self._helper.main_program + parent_block_idx = program.current_block().parent_idx + assert parent_block_idx >= 0 + parent_block = program.block(parent_block_idx) + return parent_block + + def __call__(self): + return self._translation_ids, self._translation_scores diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py new file mode 100644 index 0000000000..f09eb57791 --- /dev/null +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -0,0 +1,211 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as pd +from paddle.v2.fluid.executor import Executor +from beam_search import BasicRNNCell, TrainingDecoder, BeamSearchDecoder + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 16 +IS_SPARSE = True +batch_size = 2 +max_length = 8 +topk_size = 50 +trg_dic_size = 10000 +beam_size = 2 + +decoder_size = hidden_dim + +place = core.CPUPlace() + + +def encoder(): + # encoder + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decoder_train(context): + # decoder + trg_language_word = pd.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = pd.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn_cell = BasicRNNCell(cell_size=decoder_size) + decoder = TrainingDecoder( + rnn_cell, + step_inputs=[trg_embedding], + label_dim=target_dict_dim, + init_states=[context]) + + return decoder() + + +def decoder_decode(context): + rnn_cell = BasicRNNCell(cell_size=decoder_size) + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + def embedding(input): + return pd.embedding( + input=input, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + decoder = BeamSearchDecoder( + cell_obj=rnn_cell, + init_ids=init_ids, + init_scores=init_scores, + init_states=context, + max_length=max_length, + label_dim=trg_dic_size, + eos_token=10, + beam_width=beam_size, + embedding_layer=embedding) + + translation_ids, translation_scores = decoder() + + return translation_ids, translation_scores + + +def set_init_lod(data, lod, place): + res = core.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def train_main(): + context = encoder() + rnn_out = decoder_train(context) + label = pd.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(1): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + break + batch_id += 1 + + +def decode_main(): + context = encoder() + translation_ids, translation_scores = decoder_decode(context) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [i for i in range(batch_size)] + [batch_size] + init_lod = [init_lod, init_lod] + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + for _, data in enumerate(train_data()): + init_ids = set_init_lod(init_ids_data, init_lod, place) + init_scores = set_init_lod(init_scores_data, init_lod, place) + + src_word_data = to_lodtensor(map(lambda x: x[0], data), place) + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed={ + 'src_word_id': src_word_data, + 'init_ids': init_ids, + 'init_scores': init_scores + }, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + break + + +if __name__ == '__main__': + #train_main() + decode_main() From 6e2b78744530db486ba32af217b4ef3107ae1f9b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Thu, 8 Mar 2018 12:22:00 +0800 Subject: [PATCH 2/9] Refine api design for beam search. --- fluid/rnn_beam_search/beam_search_api.py | 195 +++++++++++++++++++++++ fluid/rnn_beam_search/simple_seq2seq.py | 35 ++-- 2 files changed, 221 insertions(+), 9 deletions(-) create mode 100644 fluid/rnn_beam_search/beam_search_api.py diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py new file mode 100644 index 0000000000..12efd73cce --- /dev/null +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -0,0 +1,195 @@ +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import contextlib +from paddle.v2.fluid.layer_helper import LayerHelper, unique_name +import paddle.v2.fluid.core as core + + +class DecoderType: + TRAINING = 1 + BEAM_SEARCH = 2 + + +class InitState(object): + def __init__(self, + init=None, + shape=None, + value=0.0, + need_reorder=False, + dtype='float32'): + self._init = init + self._shape = shape + self._value = value + self._need_reorder = need_reorder + self._dtype = dtype + + @property + def value(self): + return self._init # may create a LoDTensor + + +class MemoryState(object): + def __init__(self, state_name, rnn_obj, init_state): + self._state_name = state_name # each is a rnn.memory + self._rnn_obj = rnn_obj + self._state_mem = self._rnn_obj.memory(init=init_state.value) + + def get_state(self): + return self._state_mem + + def update_state(self, state): + self._rnn_obj.update_memory(self._state_mem, state) + + +class ArrayState(object): + def __init__(self, state_name, init_state): + self._state_name = state_name + self._counter = layers.zeros(shape=[1], dtype='int64') + self._state_array = layers.create_array('int64') + # write initial state + layers.array_write( + init_state.value, + array=self._state_array, + i=self._decoder_obj.counter) + + def get_state(self): + state = layers.array_read(array=self._state_array, i=self._counter) + return state + + def update_state(self, state): + layers.increment(x=self._counter, value=1, in_place=True) + layers.array_write(state, array=self._state_array, i=self._counter) + + +class StateCell(object): + def __init__(self, cell_size, inputs, states, name=None): + self._helper = LayerHelper("state_cell", name=name) + self._cur_states = {} + self._state_names = [] + for state_name, state in states.items(): + if not isinstance(state, InitState): + raise ValueError("State must be an InitState object.") + self._cur_states[state_name] = state + self._state_names.append(state_name) + self._inputs = inputs # inputs is place holder here + self._states_holder = {} + self._cur_decoder_obj = None + + def switch_decoder(self, decoder_obj): + self._cur_decoder_obj = decoder_obj + for state_name in self._state_names: + if state_name not in self._states_holder: + state = self._cur_states[state_name] + if not isinstance(state, InitState): + raise ValueError("Current type of state is %s, should be " + "an InitState object." % type(state)) + if decoder_obj.type == DecoderType.TRAINING: + self._states_holder[state_name][decoder_obj] = \ + MemoryState(state_name, + decoder_obj.dynamic_rnn, + state) + elif decoder_obj.type == DecoderType.BEAM_SEARCH: + self._states_holder[state_name][decoder_obj] = \ + ArrayState(state_name, state) + else: + raise ValueError("Unknown decoder type, only support " + "[TRAINING, BEAM_SEARCH]") + # Read back, since current state should be LoDTensor + self._cur_states[state_name] = \ + self._states_holder[state_name][decoder_obj].get_state() + + def get_state(self, state_name): + if state_name not in self._cur_states: + raise ValueError( + 'Unknown state %s. Please make sure switch_decoder ' + 'invoked.' % state_name) + return self._cur_states[state_name] + + def get_input(self, input_name): + if input_name not in self._inputs or self._inputs[input_name] is None: + raise ValueError("Invalid input %s." % input_name) + + def set_state(self, state_name, state_value): + self._cur_states[state_name] = state_value + + def register_updater(self, state_updater): + self._state_updater = state_updater + + def compute_state(self, inputs): + for input_name, input_value in inputs.items(): + if input_name not in self._inputs: + raise ValueError('Unknown input %s. ' + 'Please make sure %s in input ' + 'place holder.' % (input_name, input_name)) + self._inputs[input_name] = input_value + + self._state_updater() + + def update_state(self): + for _, decoder_state in self._states_holder.items(): + if self._cur_decoder_obj not in decoder_state: + raise ValueError("Unknown decoder object, please make sure " + "switch_decoder been invoked.") + decoder_state[self._cur_decoder_obj].update_state(self._cur_states[ + state_name]) + + +class TrainingDecoder(object): + BEFORE_DECODER = 0 + IN_DECODER = 1 + AFTER_DECODER = 2 + + def __init__(self, state_cell, name=None): + self._helper = LayerHelper('training_decoder', name=name) + self._status = TrainingDecoder.BEFORE_DECODER + self._dynamic_rnn = layers.DynamicRNN() + self._type = DecoderType.TRAINING + self._state_cell = state_cell + + @contextlib.contextmanager + def block(self): + if self._status != TrainingDecoder.BEFORE_DECODER: + raise ValueError("decoder.block() can only be invoked once") + self._status = TrainingDecoder.IN_DECODER + with self._dynamic_rnn.block(): + self._state_cell.switch_decoder(self) + yield + self._status = TrainingDecoder.AFTER_DECODER + + @property + def state_cell(self): + self._assert_in_decoder_block("state_cell") + return self._state_cell + + @property + def dynamic_rnn(self): + return self._dynamic_rnn + + @property + def type(self): + return self._type + + def step_input(self, x): + self._assert_in_decoder_block("step_input") + return self._dynamic_rnn.step_input(x) + + def static_input(self, x): + self._assert_in_decoder_block("static_input") + return self._dynamic_rnn.static_input(x) + + def __call__(self, *args, **kwargs): + return self._dynamic_rnn(*args, **kwargs) + + def output(self, *outputs): + self._assert_in_decoder_block("output") + self._dynamic_rnn(output) + + def _assert_in_decoder_block(self, method): + if self._status != TrainingDecoder.IN_DECODER: + raise ValueError("%s should be invoked inside training " + "decoder." % method) + + +class BeamSearchDecoder(object): + def __init__(self, state_cell): + pass diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index f09eb57791..1c9809d3ed 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -19,7 +19,7 @@ import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as pd from paddle.v2.fluid.executor import Executor -from beam_search import BasicRNNCell, TrainingDecoder, BeamSearchDecoder +from beam_search_api import * dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -56,6 +56,19 @@ def encoder(): def decoder_train(context): + h = InitState(init=context) + state_cell = StateCell( + cell_size=decoder_size, inputs={'x': None}, states={'h': h}) + from functools import partial + + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') + state_cell.set_state('h', h) + + state_cell.register_updater(partial(updater, state_cell)) + # decoder trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) @@ -66,12 +79,16 @@ def decoder_train(context): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - rnn_cell = BasicRNNCell(cell_size=decoder_size) - decoder = TrainingDecoder( - rnn_cell, - step_inputs=[trg_embedding], - label_dim=target_dict_dim, - init_states=[context]) + training_decoder = TrainingDecoder(state_cell) + + with training_decoder.block() as decoder: + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = pd.fc(input=decoder.state_cell.state('h'), + size=target_dict_dim, + act='softmax') + decoder.state_cell.update_state() + decoder.output(current_score) return decoder() @@ -207,5 +224,5 @@ def decode_main(): if __name__ == '__main__': - #train_main() - decode_main() + train_main() + #decode_main() From a4ecb69fc81d91c34ed1edf97227a0feacad4e11 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 9 Mar 2018 13:00:58 +0800 Subject: [PATCH 3/9] Currently, simple_seq2seq can run smoothly. --- fluid/rnn_beam_search/beam_search.py | 3 + fluid/rnn_beam_search/beam_search_api.py | 280 +++++++++++++++++++---- fluid/rnn_beam_search/simple_seq2seq.py | 67 +++--- 3 files changed, 279 insertions(+), 71 deletions(-) diff --git a/fluid/rnn_beam_search/beam_search.py b/fluid/rnn_beam_search/beam_search.py index 503c86e901..133d11d74d 100644 --- a/fluid/rnn_beam_search/beam_search.py +++ b/fluid/rnn_beam_search/beam_search.py @@ -152,10 +152,13 @@ def __init__(self, array_len = layers.fill_constant( shape=[1], dtype='int64', value=max_length) self.counter = layers.zeros(shape=[1], dtype='int64') + ids_array = layers.create_array('int64') layers.array_write(init_ids, array=ids_array, i=self.counter) + scores_array = layers.create_array('float32') layers.array_write(init_scores, array=scores_array, i=self.counter) + cond = layers.less_than(x=self.counter, y=array_len) cell_obj.inject_decoder(self, init_states=init_states) diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py index 12efd73cce..6d45b78a08 100644 --- a/fluid/rnn_beam_search/beam_search_api.py +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -1,5 +1,6 @@ import paddle.v2.fluid as fluid import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.framework import Variable import contextlib from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import paddle.v2.fluid.core as core @@ -42,15 +43,40 @@ def update_state(self, state): class ArrayState(object): - def __init__(self, state_name, init_state): + def __init__(self, state_name, block, init_state): self._state_name = state_name - self._counter = layers.zeros(shape=[1], dtype='int64') - self._state_array = layers.create_array('int64') + self._block = block + + self._state_array = self._block.create_var( + name=unique_name('array_state_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init_state.value.dtype) + + self._counter = self._block.create_var( + name=unique_name('array_state_counter'), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='int64') + + # initialize counter + self._block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [self._counter]}, + attrs={ + 'shape': [1], + 'dtype': self._counter.dtype, + 'value': float(0.0), + 'force_cpu': True + }) + + self._counter.stop_gradient = True + # write initial state - layers.array_write( - init_state.value, - array=self._state_array, - i=self._decoder_obj.counter) + block.append_op( + type='write_to_array', + inputs={'X': init_state.value, + 'I': self._counter}, + outputs={'Out': self._state_array}) def get_state(self): state = layers.array_read(array=self._state_array, i=self._counter) @@ -63,51 +89,80 @@ def update_state(self, state): class StateCell(object): def __init__(self, cell_size, inputs, states, name=None): - self._helper = LayerHelper("state_cell", name=name) + self._helper = LayerHelper('state_cell', name=name) self._cur_states = {} self._state_names = [] + self._states_holder = {} for state_name, state in states.items(): if not isinstance(state, InitState): - raise ValueError("State must be an InitState object.") + raise ValueError('state must be an InitState object.') self._cur_states[state_name] = state self._state_names.append(state_name) self._inputs = inputs # inputs is place holder here - self._states_holder = {} self._cur_decoder_obj = None + self._in_decoder = False + self._states_holder = {} + self._switched_decoder = False - def switch_decoder(self, decoder_obj): + def enter_decoder(self, decoder_obj): + if self._in_decoder == True or self._cur_decoder_obj is not None: + raise ValueError('StateCell has already entered a decoder.') + self._in_decoder = True self._cur_decoder_obj = decoder_obj + self._switched_decoder = False + + def _switch_decoder(self): # lazy switch + if self._in_decoder == False: + raise ValueError('StateCell must be enter a decoder.') + + if self._switched_decoder == True: + raise ValueError('StateCell already done switching.') + for state_name in self._state_names: if state_name not in self._states_holder: state = self._cur_states[state_name] + if not isinstance(state, InitState): - raise ValueError("Current type of state is %s, should be " - "an InitState object." % type(state)) - if decoder_obj.type == DecoderType.TRAINING: - self._states_holder[state_name][decoder_obj] = \ + raise ValueError('Current type of state is %s, should be ' + 'an InitState object.' % type(state)) + + self._states_holder[state_name] = {} + + if self._cur_decoder_obj.type == DecoderType.TRAINING: + self._states_holder[state_name][id(self._cur_decoder_obj)] = \ MemoryState(state_name, - decoder_obj.dynamic_rnn, + self._cur_decoder_obj.dynamic_rnn, state) - elif decoder_obj.type == DecoderType.BEAM_SEARCH: - self._states_holder[state_name][decoder_obj] = \ - ArrayState(state_name, state) + elif self._cur_decoder_obj.type == DecoderType.BEAM_SEARCH: + self._states_holder[state_name][id(self._cur_decoder_obj)] = \ + ArrayState(state_name, + self._cur_decoder_obj.parent_block(), + state) else: - raise ValueError("Unknown decoder type, only support " - "[TRAINING, BEAM_SEARCH]") + raise ValueError('Unknown decoder type, only support ' + '[TRAINING, BEAM_SEARCH]') + # Read back, since current state should be LoDTensor self._cur_states[state_name] = \ - self._states_holder[state_name][decoder_obj].get_state() + self._states_holder[state_name][id(self._cur_decoder_obj)].get_state() + + self._switched_decoder = True def get_state(self, state_name): + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + if state_name not in self._cur_states: raise ValueError( - 'Unknown state %s. Please make sure switch_decoder ' + 'Unknown state %s. Please make sure _switch_decoder() ' 'invoked.' % state_name) + return self._cur_states[state_name] def get_input(self, input_name): if input_name not in self._inputs or self._inputs[input_name] is None: - raise ValueError("Invalid input %s." % input_name) + raise ValueError('Invalid input %s.' % input_name) + return self._inputs[input_name] def set_state(self, state_name, state_value): self._cur_states[state_name] = state_value @@ -116,6 +171,9 @@ def register_updater(self, state_updater): self._state_updater = state_updater def compute_state(self, inputs): + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + for input_name, input_value in inputs.items(): if input_name not in self._inputs: raise ValueError('Unknown input %s. ' @@ -123,15 +181,30 @@ def compute_state(self, inputs): 'place holder.' % (input_name, input_name)) self._inputs[input_name] = input_value - self._state_updater() + self._state_updater(self) + + def update_states(self): + if self._in_decoder and not self._switched_decoder: + self._switched_decoder() + + for state_name, decoder_state in self._states_holder.items(): + if id(self._cur_decoder_obj) not in decoder_state: + raise ValueError('Unknown decoder object, please make sure ' + 'switch_decoder been invoked.') + decoder_state[id(self._cur_decoder_obj)].update_state( + self._cur_states[state_name]) - def update_state(self): - for _, decoder_state in self._states_holder.items(): - if self._cur_decoder_obj not in decoder_state: - raise ValueError("Unknown decoder object, please make sure " - "switch_decoder been invoked.") - decoder_state[self._cur_decoder_obj].update_state(self._cur_states[ - state_name]) + def leave_decoder(self, decoder_obj): + if self._in_decoder == False: + raise ValueError('StateCell not in decoder, ' + 'invlid leaving operation.') + + if self._cur_decoder_obj != decoder_obj: + raise ValueError('Inconsist decoder object in StateCell.') + + self._in_decoder = False + self._cur_decoder_obj = None + self._switched_decoder = True class TrainingDecoder(object): @@ -145,20 +218,21 @@ def __init__(self, state_cell, name=None): self._dynamic_rnn = layers.DynamicRNN() self._type = DecoderType.TRAINING self._state_cell = state_cell + self._state_cell.enter_decoder(self) @contextlib.contextmanager def block(self): if self._status != TrainingDecoder.BEFORE_DECODER: - raise ValueError("decoder.block() can only be invoked once") + raise ValueError('decoder.block() can only be invoked once') self._status = TrainingDecoder.IN_DECODER with self._dynamic_rnn.block(): - self._state_cell.switch_decoder(self) yield self._status = TrainingDecoder.AFTER_DECODER + self._state_cell.leave_decoder(self) @property def state_cell(self): - self._assert_in_decoder_block("state_cell") + self._assert_in_decoder_block('state_cell') return self._state_cell @property @@ -170,26 +244,146 @@ def type(self): return self._type def step_input(self, x): - self._assert_in_decoder_block("step_input") + self._assert_in_decoder_block('step_input') return self._dynamic_rnn.step_input(x) def static_input(self, x): - self._assert_in_decoder_block("static_input") + self._assert_in_decoder_block('static_input') return self._dynamic_rnn.static_input(x) def __call__(self, *args, **kwargs): + if self._status != TrainingDecoder.AFTER_DECODER: + raise ValueError('Output of training decoder can only be visited ' + 'outside the block.') return self._dynamic_rnn(*args, **kwargs) def output(self, *outputs): - self._assert_in_decoder_block("output") - self._dynamic_rnn(output) + self._assert_in_decoder_block('output') + self._dynamic_rnn.output(*outputs) def _assert_in_decoder_block(self, method): if self._status != TrainingDecoder.IN_DECODER: - raise ValueError("%s should be invoked inside training " - "decoder." % method) + raise ValueError('%s should be invoked inside block of ' + 'TrainingDecoder object.' % method) class BeamSearchDecoder(object): - def __init__(self, state_cell): - pass + BEFORE_BEAM_SEARCH_DECODER = 0 + IN_BEAM_SEARCH_DECODER = 1 + AFTER_BEAM_SEARCH_DECODER = 2 + + def __init__(self, state_cell, max_len, name=None): + self._helper = LayerHelper('beam_search_decoder', name=name) + self._counter = layers.zeros(shape=[1], dtype='int64') + self._counter.stop_gradient = True + self._type = DecoderType.BEAM_SEARCH + self._max_len = layers.fill_constant( + shape=[1], dtype='int64', value=max_len) + self._cond = layers.less_than( + x=self._counter, + y=layers.fill_constant( + shape=[1], dtype='int64', value=max_len)) + self._while_op = layers.While(self._cond) + self._state_cell = state_cell + self._state_cell.enter_decoder(self) + self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER + self._zero_idx = layers.fill_constant( + shape=[1], value=0, dtype='int64', force_cpu=True) + self._array_dict = {} + self._array_link = [] + self._ids_array = None + self._scores_array = None + + @contextlib.contextmanager + def block(self): + if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER: + raise ValueError('block() can only be invoke once.') + + self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER + + with self._while_op.block(): + yield + + layers.increment(x=self._counter, value=1.0, in_place=True) + + for value, array in self._array_link: + layers.array_write(x=value, i=self._counter, array=array) + + layers.less_than(x=self._counter, y=self._max_len, cond=self._cond) + + self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER + self._state_cell.leave_decoder(self) + + @property + def type(self): + return self._type + + # init must be provided + def read_array(self, init, is_ids=False, is_scores=False): + self._assert_in_decoder_block('read_array') + + if is_ids == True and is_scores == True: + raise ValueError('Shouldn\'t mark current array be ids array and' + 'scores array at the same time.') + + if not isinstance(init, Variable): + raise TypeError('The input argument `init` must be a Variable.') + + parent_block = self.parent_block() + array = parent_block.create_var( + name=unique_name('beam_search_decoder_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init, + 'I': self._zero_idx}, + outputs={'Out': array}) + + if is_ids == True: + self._ids_array = array + elif is_scores == True: + self._scores_array = array + + read_value = layers.array_read(array=array, i=self._counter) + self._array_dict[read_value.name] = array + return read_value + + def update_array(self, array, value): + self._assert_in_decoder_block('update_array') + + if not isinstance(array, Variable): + raise TypeError( + 'The input argument `array` of must be a Variable.') + if not isinstance(value, Variable): + raise TypeError('The input argument `value` of must be a Variable.') + + array = self._array_dict.get(array.name, None) + if array is None: + raise ValueError('Please invoke read_array before update_array.') + self._array_link.append((value, array)) + + def __call__(self): + if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER: + raise ValueError('Output of BeamSearchDecoder object can ' + 'only be visited outside the block.') + return layers.beam_search_decode( + ids=self._ids_array, scores=self._scores_array) + + @property + def state_cell(self): + self._assert_in_decoder_block('state_cell') + return self._state_cell + + def parent_block(self): + program = self._helper.main_program + parent_block_idx = program.current_block().parent_idx + if parent_block_idx < 0: + raise ValueError('Invlid block with index %d.' % parent_block_idx) + parent_block = program.block(parent_block_idx) + return parent_block + + def _assert_in_decoder_block(self, method): + if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER: + raise ValueError('%s should be invoked inside block of ' + 'BeamSearchDecoder object.' % method) diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index 1c9809d3ed..a323cb2511 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -55,19 +55,18 @@ def encoder(): return encoder_out +def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') + state_cell.set_state('h', h) + + def decoder_train(context): h = InitState(init=context) state_cell = StateCell( cell_size=decoder_size, inputs={'x': None}, states={'h': h}) - from functools import partial - - def updater(state_cell): - current_word = state_cell.get_input('x') - prev_h = state_cell.get_state('h') - h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') - state_cell.set_state('h', h) - - state_cell.register_updater(partial(updater, state_cell)) + state_cell.register_updater(updater) # decoder trg_language_word = pd.data( @@ -79,22 +78,26 @@ def updater(state_cell): is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - training_decoder = TrainingDecoder(state_cell) + decoder = TrainingDecoder(state_cell) - with training_decoder.block() as decoder: + with decoder.block(): current_word = decoder.step_input(trg_embedding) decoder.state_cell.compute_state(inputs={'x': current_word}) - current_score = pd.fc(input=decoder.state_cell.state('h'), + current_score = pd.fc(input=decoder.state_cell.get_state('h'), size=target_dict_dim, act='softmax') - decoder.state_cell.update_state() + decoder.state_cell.update_states() decoder.output(current_score) return decoder() def decoder_decode(context): - rnn_cell = BasicRNNCell(cell_size=decoder_size) + h = InitState(init=context) + state_cell = StateCell( + cell_size=decoder_size, inputs={'x': None}, states={'h': h}) + state_cell.register_updater(updater) + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = pd.data( name="init_scores", shape=[1], dtype="float32", lod_level=2) @@ -106,16 +109,24 @@ def embedding(input): dtype='float32', is_sparse=IS_SPARSE) - decoder = BeamSearchDecoder( - cell_obj=rnn_cell, - init_ids=init_ids, - init_scores=init_scores, - init_states=context, - max_length=max_length, - label_dim=trg_dic_size, - eos_token=10, - beam_width=beam_size, - embedding_layer=embedding) + decoder = BeamSearchDecoder(state_cell, max_len=max_length) + + with decoder.block(): + prev_ids = decoder.read_array(init=init_ids, is_ids=True) + prev_scores = decoder.read_array(init=init_scores, is_scores=True) + prev_ids_embedding = embedding(prev_ids) + prev_state = decoder.state_cell.get_state('h') + prev_state_expanded = pd.sequence_expand(prev_state, prev_scores) + decoder.state_cell.set_state('h', prev_state_expanded) + decoder.state_cell.compute_state(inputs={'x': prev_ids_embedding}) + current_state = decoder.state_cell.get_state('h') + scores = pd.fc(input=current_state, size=target_dict_dim, act='softmax') + topk_scores, topk_indices = pd.topk(scores, k=50) + selected_ids, selected_scores = pd.beam_search( + prev_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + decoder.state_cell.update_states() + decoder.update_array(prev_ids, selected_ids) + decoder.update_array(prev_scores, selected_scores) translation_ids, translation_scores = decoder() @@ -180,7 +191,7 @@ def train_main(): avg_cost_val = np.array(outs[0]) print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + " avg_cost=" + str(avg_cost_val)) - if batch_id > 3: + if batch_id > 3000: break batch_id += 1 @@ -220,9 +231,9 @@ def decode_main(): fetch_list=[translation_ids, translation_scores], return_numpy=False) print result_ids.lod() - break + #break if __name__ == '__main__': - train_main() - #decode_main() + #train_main() + decode_main() From 2b022f0b459cab323486feb2b928b001ea5c77cb Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 9 Mar 2018 15:14:38 +0800 Subject: [PATCH 4/9] Add attention_seq2seq.py and training can run. --- fluid/rnn_beam_search/attention_seq2seq.py | 356 +++++++++++++++++++++ fluid/rnn_beam_search/beam_search_api.py | 17 +- fluid/rnn_beam_search/simple_seq2seq.py | 24 +- 3 files changed, 379 insertions(+), 18 deletions(-) create mode 100644 fluid/rnn_beam_search/attention_seq2seq.py diff --git a/fluid/rnn_beam_search/attention_seq2seq.py b/fluid/rnn_beam_search/attention_seq2seq.py new file mode 100644 index 0000000000..8eb547befb --- /dev/null +++ b/fluid/rnn_beam_search/attention_seq2seq.py @@ -0,0 +1,356 @@ +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle.v2 +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from beam_search_api import * + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=False, + help="Whether to use gpu. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + h = InitState(init=decoder_boot, need_reorder=True) + c = InitState(init=cell_init) + + state_cell = StateCell( + cell_size=decoder_size, + inputs={'x': None, + 'encoder_vec': None, + 'encoder_proj': None}, + states={'h': h, + 'c': c}) + + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[decoder_state_expand, encoder_proj], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax(x=attention_weights) + weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + def updater(state_cell): + current_word = state_cell.get_input('x') + encoder_vec = state_cell.get_input('encoder_vec') + encoder_proj = state_cell.get_input('encoder_proj') + prev_h = state_cell.get_state('h') + prev_c = state_cell.get_state('c') + context = simple_attention(encoder_vec, encoder_proj, prev_h) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size) + state_cell.set_state('h', h) + state_cell.set_state('c', c) + + state_cell.register_updater(updater) + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + decoder = TrainingDecoder(state_cell) + + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + encoder_vec = decoder.static_input(encoded_vector) + encoder_proj = decoder.static_input(encoded_proj) + decoder.state_cell.compute_state(inputs={ + 'x': current_word, + 'encoder_vec': encoder_vec, + 'encoder_proj': encoder_proj + }) + h = decoder.state_cell.get_state('h') + decoder.state_cell.update_states() + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + decoder.output(out) + + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=decoder(), label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + else: + pass + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program(), print_log=False) + + train_batch_generator = paddle.v2.batch( + paddle.v2.reader.shuffle( + paddle.v2.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.v2.batch( + paddle.v2.reader.shuffle( + paddle.v2.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + words_seen += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + words_seen += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_val)) + + pass_end_time = time.time() + test_loss = do_validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + pass + + +if __name__ == '__main__': + args = parser.parse_args() + if args.infer_only: + infer() + else: + train() diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py index 6d45b78a08..04386ce5b8 100644 --- a/fluid/rnn_beam_search/beam_search_api.py +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -1,9 +1,9 @@ -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.framework import Variable +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.framework import Variable import contextlib -from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -import paddle.v2.fluid.core as core +from paddle.fluid.layer_helper import LayerHelper, unique_name +import paddle.fluid.core as core class DecoderType: @@ -28,12 +28,17 @@ def __init__(self, def value(self): return self._init # may create a LoDTensor + @property + def need_reorder(self): + return self._need_reorder + class MemoryState(object): def __init__(self, state_name, rnn_obj, init_state): self._state_name = state_name # each is a rnn.memory self._rnn_obj = rnn_obj - self._state_mem = self._rnn_obj.memory(init=init_state.value) + self._state_mem = self._rnn_obj.memory( + init=init_state.value, need_reorder=init_state.need_reorder) def get_state(self): return self._state_mem diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index a323cb2511..baaafc8189 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -13,17 +13,17 @@ # limitations under the License. import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as pd -from paddle.v2.fluid.executor import Executor +import paddle.v2 +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor from beam_search_api import * dict_size = 30000 source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +src_dict, trg_dict = paddle.v2.dataset.wmt14.get_dict(dict_size) hidden_dim = 32 word_dim = 16 IS_SPARSE = True @@ -166,9 +166,9 @@ def train_main(): optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), + train_data = paddle.v2.batch( + paddle.v2.reader.shuffle( + paddle.v2.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) exe = Executor(place) @@ -235,5 +235,5 @@ def decode_main(): if __name__ == '__main__': - #train_main() - decode_main() + train_main() + #decode_main() From 51b6ecfc8d130b9c60125c1a73e27bee36c667ad Mon Sep 17 00:00:00 2001 From: yangyaming Date: Fri, 9 Mar 2018 17:24:17 +0800 Subject: [PATCH 5/9] Replace callback with decorator. --- fluid/rnn_beam_search/attention_seq2seq.py | 106 ++++++++++++++++++++- fluid/rnn_beam_search/beam_search_api.py | 13 ++- fluid/rnn_beam_search/simple_seq2seq.py | 15 ++- 3 files changed, 119 insertions(+), 15 deletions(-) diff --git a/fluid/rnn_beam_search/attention_seq2seq.py b/fluid/rnn_beam_search/attention_seq2seq.py index 8eb547befb..9eb6bc3f24 100644 --- a/fluid/rnn_beam_search/attention_seq2seq.py +++ b/fluid/rnn_beam_search/attention_seq2seq.py @@ -181,7 +181,8 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') return context - def updater(state_cell): + @state_cell.state_updater + def state_updater(state_cell): current_word = state_cell.get_input('x') encoder_vec = state_cell.get_input('encoder_vec') encoder_proj = state_cell.get_input('encoder_proj') @@ -194,8 +195,6 @@ def updater(state_cell): state_cell.set_state('h', h) state_cell.set_state('c', c) - state_cell.register_updater(updater) - if not is_generating: trg_word_idx = fluid.layers.data( name='target_sequence', shape=[1], dtype='int64', lod_level=1) @@ -233,7 +232,68 @@ def updater(state_cell): return avg_cost, feeding_list else: - pass + init_ids = fluid.layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = fluid.layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + ''' + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + ''' + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32', + ParamAttr=()) + + decoder = BeamSearchDecoder(state_cell, max_len=max_length) + + with decoder.block(): + # encoder_vec = prev_scores + # encoder_proj = prev_scores + prev_ids = decoder.read_array(init=init_ids, is_ids=True) + prev_scores = decoder.read_array(init=init_scores, is_scores=True) + # need make sure the weight shared + prev_ids_embedding = fluid.layers.embedding(prev_ids) + prev_h = decoder.state_cell.get_state('h') + prev_c = decoder.state_cell.get_state('c') + prev_h_expanded = fluid.layers.sequence_expand(prev_h, prev_scores) + prev_c_expanded = fluid.layers.sequence_expand(prev_c, prev_scores) + decoder.state_cell.set_state('h', prev_h_expanded) + decoder.state_cell.set_state('c', prev_c_expanded) + + decoder.state_cell.compute_state(inputs={ + 'x': prev_ids_embedding, + 'encoder_vec': None, + 'encoder_proj': None + }) + + current_state = decoder.state_cell.get_state('h') + scores = fluid.layers.fc(input=current_state, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = fluid.layers.topk(scores, k=beam_size) + selected_ids, selected_scores = fluid.layers.beam_search( + prev_ids, + topk_indices, + topk_scores, + beam_size, + end_id=10, + level=0) + decoder.state_cell.update_states() + decoder.update_array(prev_ids, selected_ids) + decoder.update_array(prev_scores, selected_scores) + + translation_ids, translation_scores = decoder() + + feeding_list = [ + "source_sequence", "target_sequence", "init_ids", "init_scores" + ] + + return translation_ids, translation_scores, feeding_list def to_lodtensor(data, place): @@ -345,7 +405,43 @@ def do_validation(): def infer(): - pass + translation_ids, translation_scores, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + True, + beam_size=args.beam_size, + max_length=args.max_length) + + fluid.memory_optimize(fluid.default_main_program(), print_log=False) + + test_batch_generator = paddle.v2.batch( + paddle.v2.reader.shuffle( + paddle.v2.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + for batch_id, data in enumerate(test_batch_generator()): + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % (pass_id, batch_id, + avg_cost_val)) if __name__ == '__main__': diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py index 04386ce5b8..a5630a2f9d 100644 --- a/fluid/rnn_beam_search/beam_search_api.py +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -108,6 +108,7 @@ def __init__(self, cell_size, inputs, states, name=None): self._in_decoder = False self._states_holder = {} self._switched_decoder = False + self._state_updater = None def enter_decoder(self, decoder_obj): if self._in_decoder == True or self._cur_decoder_obj is not None: @@ -172,8 +173,16 @@ def get_input(self, input_name): def set_state(self, state_name, state_value): self._cur_states[state_name] = state_value - def register_updater(self, state_updater): - self._state_updater = state_updater + def state_updater(self, updater): + self._state_updater = updater + + def _decorator(state_cell): + if state_cell == self: + raise TypeError('Updater should only accept a StateCell object ' + 'as argument.') + updater(state_cell) + + return _decorator def compute_state(self, inputs): if self._in_decoder and not self._switched_decoder: diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index baaafc8189..31d870277d 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -55,18 +55,17 @@ def encoder(): return encoder_out -def updater(state_cell): - current_word = state_cell.get_input('x') - prev_h = state_cell.get_state('h') - h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') - state_cell.set_state('h', h) - - def decoder_train(context): h = InitState(init=context) state_cell = StateCell( cell_size=decoder_size, inputs={'x': None}, states={'h': h}) - state_cell.register_updater(updater) + + @state_cell.state_updater + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') + state_cell.set_state('h', h) # decoder trg_language_word = pd.data( From abfe896e1863641de99dbc0a3f768a805012a191 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Sun, 18 Mar 2018 17:06:50 +0800 Subject: [PATCH 6/9] Make embedding table shared between training and beam search. --- fluid/rnn_beam_search/attention_seq2seq.py | 23 ++++++++++----- fluid/rnn_beam_search/beam_search_api.py | 29 ++++++++++--------- fluid/rnn_beam_search/simple_seq2seq.py | 33 ++++++++++++---------- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/fluid/rnn_beam_search/attention_seq2seq.py b/fluid/rnn_beam_search/attention_seq2seq.py index 9eb6bc3f24..2a5fe617a1 100644 --- a/fluid/rnn_beam_search/attention_seq2seq.py +++ b/fluid/rnn_beam_search/attention_seq2seq.py @@ -124,7 +124,8 @@ def bi_lstm_encoder(input_seq, gate_size): src_embedding = fluid.layers.embedding( input=src_word_idx, size=[source_dict_dim, embedding_dim], - dtype='float32') + dtype='float32', + param_attr=fluid.ParamAttr(name='src_embedding')) src_forward, src_reversed = bi_lstm_encoder( input_seq=src_embedding, gate_size=encoder_size) @@ -202,7 +203,8 @@ def state_updater(state_cell): trg_embedding = fluid.layers.embedding( input=trg_word_idx, size=[target_dict_dim, embedding_dim], - dtype='float32') + dtype='float32', + param_attr=fluid.ParamAttr('trg_embedding')) decoder = TrainingDecoder(state_cell) @@ -242,12 +244,18 @@ def state_updater(state_cell): size=[source_dict_dim, embedding_dim], dtype='float32') ''' - - src_embedding = fluid.layers.embedding( - input=src_word_idx, - size=[source_dict_dim, embedding_dim], + fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], dtype='float32', - ParamAttr=()) + param_attr=fluid.ParamAttr('trg_embedding')) + + def embedding(input): + fluid.layers.embedding( + input=input, + size=[target_dict_dim, embedding_dim], + dtype='float32', + param_attr=fluid.ParamAttr('trg_embedding')) decoder = BeamSearchDecoder(state_cell, max_len=max_length) @@ -272,6 +280,7 @@ def state_updater(state_cell): }) current_state = decoder.state_cell.get_state('h') + # we can copy lod from prev_ids to current_state scores = fluid.layers.fc(input=current_state, size=target_dict_dim, act='softmax') diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py index a5630a2f9d..a62ee605f9 100644 --- a/fluid/rnn_beam_search/beam_search_api.py +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -1,8 +1,9 @@ +import contextlib import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.framework import Variable -import contextlib -from paddle.fluid.layer_helper import LayerHelper, unique_name +from paddle.fluid import framework, unique_name +from paddle.fluid.layer_helper import LayerHelper import paddle.fluid.core as core @@ -53,12 +54,12 @@ def __init__(self, state_name, block, init_state): self._block = block self._state_array = self._block.create_var( - name=unique_name('array_state_array'), + name=unique_name.generate('array_state_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=init_state.value.dtype) self._counter = self._block.create_var( - name=unique_name('array_state_counter'), + name=unique_name.generate('array_state_counter'), type=core.VarDesc.VarType.LOD_TENSOR, dtype='int64') @@ -135,15 +136,15 @@ def _switch_decoder(self): # lazy switch self._states_holder[state_name] = {} if self._cur_decoder_obj.type == DecoderType.TRAINING: - self._states_holder[state_name][id(self._cur_decoder_obj)] = \ - MemoryState(state_name, - self._cur_decoder_obj.dynamic_rnn, - state) + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = MemoryState(state_name, + self._cur_decoder_obj.dynamic_rnn, + state) elif self._cur_decoder_obj.type == DecoderType.BEAM_SEARCH: - self._states_holder[state_name][id(self._cur_decoder_obj)] = \ - ArrayState(state_name, - self._cur_decoder_obj.parent_block(), - state) + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = ArrayState(state_name, + self._cur_decoder_obj.parent_block(), + state) else: raise ValueError('Unknown decoder type, only support ' '[TRAINING, BEAM_SEARCH]') @@ -218,7 +219,7 @@ def leave_decoder(self, decoder_obj): self._in_decoder = False self._cur_decoder_obj = None - self._switched_decoder = True + self._switched_decoder = False class TrainingDecoder(object): @@ -345,7 +346,7 @@ def read_array(self, init, is_ids=False, is_scores=False): parent_block = self.parent_block() array = parent_block.create_var( - name=unique_name('beam_search_decoder_array'), + name=unique_name.generate('beam_search_decoder_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=init.dtype) parent_block.append_op( diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index 31d870277d..37e380ea0f 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -55,7 +55,7 @@ def encoder(): return encoder_out -def decoder_train(context): +def decoder_state_cell(context): h = InitState(init=context) state_cell = StateCell( cell_size=decoder_size, inputs={'x': None}, states={'h': h}) @@ -67,6 +67,10 @@ def updater(state_cell): h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') state_cell.set_state('h', h) + return state_cell + + +def decoder_train(state_cell): # decoder trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) @@ -91,22 +95,18 @@ def updater(state_cell): return decoder() -def decoder_decode(context): - h = InitState(init=context) - state_cell = StateCell( - cell_size=decoder_size, inputs={'x': None}, states={'h': h}) - state_cell.register_updater(updater) - +def decoder_decode(state_cell): init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = pd.data( name="init_scores", shape=[1], dtype="float32", lod_level=2) def embedding(input): - return pd.embedding( + pd.embedding( input=input, - size=[dict_size, word_dim], + size=[dict_dim, word_dim], dtype='float32', - is_sparse=IS_SPARSE) + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr('vemb')) decoder = BeamSearchDecoder(state_cell, max_len=max_length) @@ -119,6 +119,7 @@ def embedding(input): decoder.state_cell.set_state('h', prev_state_expanded) decoder.state_cell.compute_state(inputs={'x': prev_ids_embedding}) current_state = decoder.state_cell.get_state('h') + # copy lod from prev_ids to current_state scores = pd.fc(input=current_state, size=target_dict_dim, act='softmax') topk_scores, topk_indices = pd.topk(scores, k=50) selected_ids, selected_scores = pd.beam_search( @@ -156,7 +157,8 @@ def to_lodtensor(data, place): def train_main(): context = encoder() - rnn_out = decoder_train(context) + state_cell = decoder_state_cell(context) + rnn_out = decoder_train(state_cell) label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = pd.cross_entropy(input=rnn_out, label=label) @@ -197,7 +199,8 @@ def train_main(): def decode_main(): context = encoder() - translation_ids, translation_scores = decoder_decode(context) + state_cell = decoder_state_cell(context) + translation_ids, translation_scores = decoder_decode(state_cell) exe = Executor(place) exe.run(framework.default_startup_program()) @@ -210,9 +213,9 @@ def decode_main(): init_lod = [i for i in range(batch_size)] + [batch_size] init_lod = [init_lod, init_lod] - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000), + train_data = paddle.v2.batch( + paddle.v2.reader.shuffle( + paddle.v2.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) for _, data in enumerate(train_data()): init_ids = set_init_lod(init_ids_data, init_lod, place) From 14c1e75ec9d0c8f3d7c4c66b3cea1a0b149e99c1 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Sun, 18 Mar 2018 17:26:48 +0800 Subject: [PATCH 7/9] Make sure lod of state kept consistent. --- fluid/rnn_beam_search/beam_search_api.py | 1 - fluid/rnn_beam_search/simple_seq2seq.py | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fluid/rnn_beam_search/beam_search_api.py b/fluid/rnn_beam_search/beam_search_api.py index a62ee605f9..78489d365c 100644 --- a/fluid/rnn_beam_search/beam_search_api.py +++ b/fluid/rnn_beam_search/beam_search_api.py @@ -195,7 +195,6 @@ def compute_state(self, inputs): 'Please make sure %s in input ' 'place holder.' % (input_name, input_name)) self._inputs[input_name] = input_value - self._state_updater(self) def update_states(self): diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index 37e380ea0f..eaf887890e 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -64,7 +64,8 @@ def decoder_state_cell(context): def updater(state_cell): current_word = state_cell.get_input('x') prev_h = state_cell.get_state('h') - h = pd.fc(input=[current_word, prev_h], size=decoder_size, act='tanh') + # make sure lod of h heritted from prev_h + h = pd.fc(input=[prev_h, current_word], size=decoder_size, act='tanh') state_cell.set_state('h', h) return state_cell @@ -101,9 +102,9 @@ def decoder_decode(state_cell): name="init_scores", shape=[1], dtype="float32", lod_level=2) def embedding(input): - pd.embedding( + return pd.embedding( input=input, - size=[dict_dim, word_dim], + size=[dict_size, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr('vemb')) @@ -237,5 +238,5 @@ def decode_main(): if __name__ == '__main__': - train_main() - #decode_main() + #train_main() + decode_main() From 683d2488ed84c62bda1e9df51d02c5d58a948a8a Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 20 Mar 2018 13:32:35 +0800 Subject: [PATCH 8/9] Adapt to new sequence_expand. --- fluid/rnn_beam_search/attention_seq2seq.py | 75 +++++++++++----------- fluid/rnn_beam_search/simple_seq2seq.py | 9 ++- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/fluid/rnn_beam_search/attention_seq2seq.py b/fluid/rnn_beam_search/attention_seq2seq.py index 2a5fe617a1..c9d71bcec3 100644 --- a/fluid/rnn_beam_search/attention_seq2seq.py +++ b/fluid/rnn_beam_search/attention_seq2seq.py @@ -169,13 +169,15 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand( x=decoder_state_proj, y=encoder_proj) + # concated lod should inherit from encoder_proj concated = fluid.layers.concat( - input=[decoder_state_expand, encoder_proj], axis=1) + input=[encoder_proj, decoder_state_expand], axis=1) attention_weights = fluid.layers.fc(input=concated, size=1, act='tanh', bias_attr=False) - attention_weights = fluid.layers.sequence_softmax(x=attention_weights) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weigths_reshape, axis=0) @@ -238,20 +240,9 @@ def state_updater(state_cell): name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = fluid.layers.data( name="init_scores", shape=[1], dtype="float32", lod_level=2) - ''' - src_embedding = fluid.layers.embedding( - input=src_word_idx, - size=[source_dict_dim, embedding_dim], - dtype='float32') - ''' - fluid.layers.embedding( - input=trg_word_idx, - size=[target_dict_dim, embedding_dim], - dtype='float32', - param_attr=fluid.ParamAttr('trg_embedding')) def embedding(input): - fluid.layers.embedding( + return fluid.layers.embedding( input=input, size=[target_dict_dim, embedding_dim], dtype='float32', @@ -260,28 +251,30 @@ def embedding(input): decoder = BeamSearchDecoder(state_cell, max_len=max_length) with decoder.block(): - # encoder_vec = prev_scores - # encoder_proj = prev_scores + encoder_vec = decoder.read_array(init=encoded_vector) + encoder_proj = decoder.read_array(init=encoded_proj) prev_ids = decoder.read_array(init=init_ids, is_ids=True) prev_scores = decoder.read_array(init=init_scores, is_scores=True) - # need make sure the weight shared - prev_ids_embedding = fluid.layers.embedding(prev_ids) + prev_ids_embedding = embedding(prev_ids) prev_h = decoder.state_cell.get_state('h') prev_c = decoder.state_cell.get_state('c') prev_h_expanded = fluid.layers.sequence_expand(prev_h, prev_scores) prev_c_expanded = fluid.layers.sequence_expand(prev_c, prev_scores) + encoder_vec_expanded = fluid.layers.sequence_expand(encoder_vec, + prev_scores) + encoder_proj_expanded = fluid.layers.sequence_expand(encoder_proj, + prev_scores) decoder.state_cell.set_state('h', prev_h_expanded) decoder.state_cell.set_state('c', prev_c_expanded) - decoder.state_cell.compute_state(inputs={ 'x': prev_ids_embedding, - 'encoder_vec': None, - 'encoder_proj': None + 'encoder_vec': encoder_vec_expanded, + 'encoder_proj': encoder_proj_expanded }) - current_state = decoder.state_cell.get_state('h') - # we can copy lod from prev_ids to current_state - scores = fluid.layers.fc(input=current_state, + current_state_with_lod = fluid.layers.lod_reset( + x=current_state, y=prev_scores) + scores = fluid.layers.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') topk_scores, topk_indices = fluid.layers.topk(scores, k=beam_size) @@ -290,29 +283,29 @@ def embedding(input): topk_indices, topk_scores, beam_size, - end_id=10, + end_id=1, level=0) decoder.state_cell.update_states() decoder.update_array(prev_ids, selected_ids) decoder.update_array(prev_scores, selected_scores) + decoder.update_array(encoder_vec, encoder_vec_expanded) + decoder.update_array(encoder_proj, encoder_proj_expanded) translation_ids, translation_scores = decoder() - feeding_list = [ - "source_sequence", "target_sequence", "init_ids", "init_scores" - ] + feeding_list = ["source_sequence", "init_ids", "init_scores"] return translation_ids, translation_scores, feeding_list -def to_lodtensor(data, place): +def to_lodtensor(data, place, dtype='int64'): seq_lens = [len(seq) for seq in data] cur_len = 0 lod = [cur_len] for l in seq_lens: cur_len += l lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = np.concatenate(data, axis=0).astype(dtype) flattened_data = flattened_data.reshape([len(flattened_data), 1]) lod_t = core.LoDTensor() lod_t.set(flattened_data, place) @@ -436,21 +429,25 @@ def infer(): exe.run(framework.default_startup_program()) for batch_id, data in enumerate(test_batch_generator()): - src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) - trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) - lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + batch_size = len(data) + src_seq, _ = to_lodtensor(map(lambda x: x[0], data), place) + init_ids, _ = to_lodtensor([[0] for _ in xrange(batch_size)], place) + init_ids.set_lod(init_ids.lod() + [init_ids.lod()[-1]]) + init_scores, _ = to_lodtensor([[1.0] for _ in xrange(batch_size)], + place, 'float32') + init_scores.set_lod(init_scores.lod() + [init_scores.lod()[-1]]) fetch_outs = exe.run(framework.default_main_program(), feed={ feeding_list[0]: src_seq, - feeding_list[1]: trg_seq, - feeding_list[2]: lbl_seq + feeding_list[1]: init_ids, + feeding_list[2]: init_scores }, - fetch_list=[avg_cost]) + fetch_list=[translation_ids, translation_scores], + return_numpy=False) - avg_cost_val = np.array(fetch_outs[0]) - print('pass_id=%d, batch_id=%d, train_loss: %f' % (pass_id, batch_id, - avg_cost_val)) + print(fetch_outs[0].lod()) + break if __name__ == '__main__': diff --git a/fluid/rnn_beam_search/simple_seq2seq.py b/fluid/rnn_beam_search/simple_seq2seq.py index eaf887890e..c83922e609 100644 --- a/fluid/rnn_beam_search/simple_seq2seq.py +++ b/fluid/rnn_beam_search/simple_seq2seq.py @@ -120,11 +120,14 @@ def embedding(input): decoder.state_cell.set_state('h', prev_state_expanded) decoder.state_cell.compute_state(inputs={'x': prev_ids_embedding}) current_state = decoder.state_cell.get_state('h') + current_state_with_lod = pd.lod_reset(x=current_state, y=prev_scores) # copy lod from prev_ids to current_state - scores = pd.fc(input=current_state, size=target_dict_dim, act='softmax') + scores = pd.fc(input=current_state_with_lod, + size=target_dict_dim, + act='softmax') topk_scores, topk_indices = pd.topk(scores, k=50) selected_ids, selected_scores = pd.beam_search( - prev_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + prev_ids, topk_indices, topk_scores, beam_size, end_id=1, level=0) decoder.state_cell.update_states() decoder.update_array(prev_ids, selected_ids) decoder.update_array(prev_scores, selected_scores) @@ -206,7 +209,7 @@ def decode_main(): exe = Executor(place) exe.run(framework.default_startup_program()) - init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') init_scores_data = np.array( [1. for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) From 61e1d00b8e42a86a39fe170e28c239c8d6301e3e Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 20 Mar 2018 13:38:10 +0800 Subject: [PATCH 9/9] Remove old beam search api. --- fluid/rnn_beam_search/beam_search.py | 208 --------------------------- 1 file changed, 208 deletions(-) delete mode 100644 fluid/rnn_beam_search/beam_search.py diff --git a/fluid/rnn_beam_search/beam_search.py b/fluid/rnn_beam_search/beam_search.py deleted file mode 100644 index 133d11d74d..0000000000 --- a/fluid/rnn_beam_search/beam_search.py +++ /dev/null @@ -1,208 +0,0 @@ -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -import paddle.v2.fluid.core as core - - -class DecoderType: - TRAINING = 1 - BEAM_SEARCH = 2 - - -class BasicRNNCell(object): - def __init__(self, cell_size): - self._size = cell_size - - def inject_decoder(self, decoder_obj, **kwargs): - self._decoder_obj = decoder_obj - - if isinstance(decoder_obj, TrainingDecoder): - self._decoder_type = DecoderType.TRAINING - assert 'step_inputs' in kwargs - self._step_inputs = kwargs['step_inputs'] - assert 'init_states' in kwargs - self._init_states = kwargs['init_states'] - elif isinstance(decoder_obj, BeamSearchDecoder): - self._decoder_type = DecoderType.BEAM_SEARCH - assert 'init_states' in kwargs - self._init_states = kwargs['init_states'] - - def init_inputs(self): - if self._decoder_type == DecoderType.TRAINING: - assert len(self._step_inputs) == 1 - self._current_word = self._decoder_obj.rnn.step_input( - self._step_inputs[0]) - - def init_states(self): - if self._decoder_type == DecoderType.TRAINING: - assert len(self._init_states) == 1 - self._hidden_mem = self._decoder_obj.rnn.memory( - init=self._init_states[0]) - elif self._decoder_type == DecoderType.BEAM_SEARCH: - parent_block = self._decoder_obj.parent_block() - self._hidden_mem = parent_block.create_var( - name=unique_name('beam_search_basic_rnn_state'), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=self._init_states[0].dtype) - parent_block.append_op( - type='write_to_array', - inputs={ - 'X': self._init_states[0], - 'I': self._decoder_obj.counter - }, - outputs={'Out': self._hidden_mem}) - - def compute_current_state(self, **kwargs): - if self._decoder_type == DecoderType.TRAINING: - self._current_state = layers.fc( - input=[self._current_word, self._hidden_mem], size=self._size) - elif self._decoder_type == DecoderType.BEAM_SEARCH: - prev_state = layers.array_read( - array=self._hidden_mem, i=self._decoder_obj.counter) - assert 'prev_scores' in kwargs - prev_state_expanded = layers.sequence_expand(prev_state, - kwargs['prev_scores']) - assert 'prev_ids_embedding' in kwargs - self._current_state = layers.fc( - input=[kwargs['prev_ids_embedding'], prev_state_expanded], - size=self._size, - act='tanh') - - def update_states(self): - if self._decoder_type == DecoderType.TRAINING: - self._decoder_obj.rnn.update_memory(self._hidden_mem, - self._current_state) - elif self._decoder_type == DecoderType.BEAM_SEARCH: - layers.array_write( - self._current_state, - array=self._hidden_mem, - i=self._decoder_obj.counter) - - def update_outputs(self): - # may provide a output call back - if self._decoder_type == DecoderType.TRAINING: - self.calc_scores() - self._decoder_obj.rnn.output(self._current_scores) - - def calc_scores(self): - if self._decoder_type == DecoderType.TRAINING: - self._current_scores = layers.fc(input=self._current_state, - size=self._decoder_obj.label_dim, - act='softmax') - return self._current_scores - elif self._decoder_type == DecoderType.BEAM_SEARCH: - self._current_scores = layers.fc(input=self._current_state, - size=self._decoder_obj.label_dim, - act='softmax') - return self._current_scores - - -class TrainingDecoder(object): - def __init__(self, - cell_obj, - step_inputs, - label_dim, - static_inputs=None, - init_states=None): - self.label_dim = label_dim - self._helper = LayerHelper('training_decoder', name=name) - - if not isinstance(step_inputs, list): - step_inputs = [step_inputs] - - if static_inputs is not None and not isinstance(static_inputs, list): - static_inputs = [static_inputs] - - if init_states is not None and not isinstance(init_states, list): - init_states = [init_states] - - self.rnn = layers.DynamicRNN() - cell_obj.inject_decoder( - self, step_inputs=step_inputs, init_states=init_states) - - with self.rnn.block(): - cell_obj.init_inputs() - cell_obj.init_states() - cell_obj.compute_current_state() - cell_obj.update_states() - cell_obj.update_outputs() - - def __call__(self): - return self.rnn() - - -class BeamSearchDecoder(object): - def __init__(self, - cell_obj, - init_ids, - init_scores, - init_states, - max_length, - label_dim, - eos_token, - beam_width, - embedding_layer, - name=None): - self._helper = LayerHelper('beam_search_decoder', name=name) - self.label_dim = label_dim - - if not isinstance(init_states, list): - init_states = [init_states] - - array_len = layers.fill_constant( - shape=[1], dtype='int64', value=max_length) - self.counter = layers.zeros(shape=[1], dtype='int64') - - ids_array = layers.create_array('int64') - layers.array_write(init_ids, array=ids_array, i=self.counter) - - scores_array = layers.create_array('float32') - layers.array_write(init_scores, array=scores_array, i=self.counter) - - cond = layers.less_than(x=self.counter, y=array_len) - - cell_obj.inject_decoder(self, init_states=init_states) - - while_op = layers.While(cond=cond) - with while_op.block(): - cell_obj.init_states() - prev_ids = layers.array_read(array=ids_array, i=self.counter) - prev_scores = layers.array_read(array=scores_array, i=self.counter) - prev_ids_embedding = embedding_layer(prev_ids) - - cell_obj.compute_current_state( - prev_scores=prev_scores, prev_ids_embedding=prev_ids_embedding) - - current_scores = cell_obj.calc_scores() - - topk_scores, topk_indices = layers.topk( - current_scores, k=beam_width) - selected_ids, selected_scores = layers.beam_search( - prev_ids, - topk_indices, - topk_scores, - beam_width, - end_id=eos_token, - level=0) - - layers.increment(x=self.counter, value=1, in_place=True) - - cell_obj.update_states() - layers.array_write(selected_ids, array=ids_array, i=self.counter) - layers.array_write( - selected_scores, array=scores_array, i=self.counter) - - layers.less_than(x=self.counter, y=array_len, cond=cond) - - self._translation_ids, self._translation_scores = layers.beam_search_decode( - ids=ids_array, scores=scores_array) - - def parent_block(self): - program = self._helper.main_program - parent_block_idx = program.current_block().parent_idx - assert parent_block_idx >= 0 - parent_block = program.block(parent_block_idx) - return parent_block - - def __call__(self): - return self._translation_ids, self._translation_scores