From ce1ea0fb02edac6ad1149df728c57014b70c7c4b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 20 Dec 2017 16:01:53 +0800 Subject: [PATCH 1/4] Add seq2seq model for fluid. --- fluid/machine_translation.py | 261 +++++++++++++++++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 fluid/machine_translation.py diff --git a/fluid/machine_translation.py b/fluid/machine_translation.py new file mode 100644 index 0000000..e5d4284 --- /dev/null +++ b/fluid/machine_translation.py @@ -0,0 +1,261 @@ +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +from paddle.v2.fluid.param_attr import ParamAttr +from paddle.v2.fluid.executor import Executor + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--word_vector_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=4, + help="The sequence number of a batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_number", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--mode", + type=str, + default='train', + choices=['train', 'infer'], + help="Do training or inference. (default: %(default)s)") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether use gpu. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The max length of sequence when doing generation. " + "(default: %(default)d)") + + +def seq_to_seq_net(word_vector_dim, + encoder_size, + decoder_size, + source_dict_dim, + target_dict_dim, + is_generating=False, + beam_size=3, + max_length=250): + """Construct a seq2seq network.""" + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + def bi_lstm_encoder(input_seq, size): + input_forward_proj = fluid.layers.fc(input=input_seq, + size=size * 4, + act='tanh') + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=size * 4) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=size * 4, + act='tanh') + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, size=size * 4, is_reverse=True) + return forward, reversed + + src_word_idx = fluid.layers.data( + name=feeding_list[0], shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, word_vector_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[decoder_state_expand, encoder_proj], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + x=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + hidden_mem = rnn.memory(init=decoder_boot) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = fluid.layers.lstm_unit( + x_t=decoder_inputs, + hidden_t_prev=hidden_mem, + cell_t_prev=cell_mem) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=ParamAttr(), + act='softmax') + rnn.output(out) + + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name=feeding_list[1], shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, word_vector_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + + label = fluid.layers.data( + name=feeding_list[2], shape=[1], dtype='int64', lod_level=1) + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.word_vector_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + optimizer = fluid.optimizer.Adam(learning_rate=5e-5) + optimizer.minimize(avg_cost) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.GPUPlace() if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + for pass_id in xrange(args.pass_number): + for batch_id, data in enumerate(train_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place) + trg_seq = to_lodtensor(map(lambda x: x[1], data), place) + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run( + framework.default_main_program(), + feed=dict(zip(*[feeding_list, (src_seq, trg_seq, lbl_seq)])), + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + + print('pass_id=%d, batch=%d, avg_cost=%f' % + (pass_id, batch_id, avg_cost_val)) + + +def infer(): + pass + + +if __name__ == '__main__': + args = parser.parse_args() + if args.mode == 'train': + train() + else: + infer() From ba2ea17f7c0880920b9c64220900e026c1ec66ca Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 15 Jan 2018 22:40:16 +0800 Subject: [PATCH 2/4] Complete seq2seq of fluid. --- fluid/machine_translation.py | 100 +++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 16 deletions(-) diff --git a/fluid/machine_translation.py b/fluid/machine_translation.py index e5d4284..a91d15e 100644 --- a/fluid/machine_translation.py +++ b/fluid/machine_translation.py @@ -34,7 +34,7 @@ parser.add_argument( "--batch_size", type=int, - default=4, + default=16, help="The sequence number of a batch data. (default: %(default)d)") parser.add_argument( "--dict_size", @@ -47,6 +47,11 @@ type=int, default=2, help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") parser.add_argument( "--mode", type=str, @@ -71,6 +76,27 @@ "(default: %(default)d)") +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + def seq_to_seq_net(word_vector_dim, encoder_size, decoder_size, @@ -153,15 +179,14 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): with rnn.block(): current_word = rnn.step_input(target_embedding) - hidden_mem = rnn.memory(init=decoder_boot) + encoder_vec = rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) cell_mem = rnn.memory(init=cell_init) context = simple_attention(encoder_vec, encoder_proj, hidden_mem) decoder_inputs = fluid.layers.concat( input=[context, current_word], axis=1) - h, c = fluid.layers.lstm_unit( - x_t=decoder_inputs, - hidden_t_prev=hidden_mem, - cell_t_prev=cell_mem) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) rnn.update_memory(hidden_mem, h) rnn.update_memory(cell_mem, c) out = fluid.layers.fc(input=h, @@ -169,7 +194,6 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): bias_attr=ParamAttr(), act='softmax') rnn.output(out) - return rnn() if not is_generating: @@ -187,7 +211,6 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): label = fluid.layers.data( name=feeding_list[2], shape=[1], dtype='int64', lod_level=1) - cost = fluid.layers.cross_entropy(input=prediction, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -206,7 +229,15 @@ def to_lodtensor(data, place): lod_t = core.LoDTensor() lod_t.set(flattened_data, place) lod_t.set_lod([lod]) - return lod_t + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray def train(): @@ -220,7 +251,10 @@ def train(): beam_size=args.beam_size, max_length=args.max_length) - optimizer = fluid.optimizer.Adam(learning_rate=5e-5) + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) optimizer.minimize(avg_cost) train_batch_generator = paddle.batch( @@ -228,15 +262,43 @@ def train(): paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), batch_size=args.batch_size) - place = core.GPUPlace() if args.use_gpu else core.CPUPlace() + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run( + inference_program, + feed=dict(zip(*[feeding_list, (src_seq, trg_seq, lbl_seq)])), + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + for pass_id in xrange(args.pass_number): + pass_start_time = time.time() + words_seen = 0 for batch_id, data in enumerate(train_batch_generator()): - src_seq = to_lodtensor(map(lambda x: x[0], data), place) - trg_seq = to_lodtensor(map(lambda x: x[1], data), place) - lbl_seq = to_lodtensor(map(lambda x: x[2], data), place) + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + words_seen += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + words_seen += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) fetch_outs = exe.run( framework.default_main_program(), @@ -244,10 +306,16 @@ def train(): fetch_list=[avg_cost]) avg_cost_val = np.array(fetch_outs[0]) - - print('pass_id=%d, batch=%d, avg_cost=%f' % + print('pass_id=%d, batch_id=%d, train_loss: %f' % (pass_id, batch_id, avg_cost_val)) + pass_end_time = time.time() + test_loss = do_validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + def infer(): pass From d5b7bb7a843f0c807ea619e4ea5dba22973260ad Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 16 Jan 2018 17:17:35 +0800 Subject: [PATCH 3/4] Refine the script. --- fluid/machine_translation.py | 68 +++++++++++++++++------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/fluid/machine_translation.py b/fluid/machine_translation.py index a91d15e..b1f3e30 100644 --- a/fluid/machine_translation.py +++ b/fluid/machine_translation.py @@ -12,12 +12,11 @@ import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.param_attr import ParamAttr from paddle.v2.fluid.executor import Executor parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - "--word_vector_dim", + "--embedding_dim", type=int, default=512, help="The dimension of embedding table. (default: %(default)d)") @@ -35,7 +34,7 @@ "--batch_size", type=int, default=16, - help="The sequence number of a batch data. (default: %(default)d)") + help="The sequence number of a mini-batch data. (default: %(default)d)") parser.add_argument( "--dict_size", type=int, @@ -43,7 +42,7 @@ help="The dictionary capacity. Dictionaries of source sequence and " "target dictionary have same capacity. (default: %(default)d)") parser.add_argument( - "--pass_number", + "--pass_num", type=int, default=2, help="The pass number to train. (default: %(default)d)") @@ -53,11 +52,7 @@ default=0.0002, help="Learning rate used to train the model. (default: %(default)f)") parser.add_argument( - "--mode", - type=str, - default='train', - choices=['train', 'infer'], - help="Do training or inference. (default: %(default)s)") + "--infer_only", action='store_true', help="If set, run forward only.") parser.add_argument( "--beam_size", type=int, @@ -67,12 +62,12 @@ "--use_gpu", type=distutils.util.strtobool, default=True, - help="Whether use gpu. (default: %(default)d)") + help="Whether to use gpu. (default: %(default)d)") parser.add_argument( "--max_length", type=int, default=250, - help="The max length of sequence when doing generation. " + help="The maximum length of sequence when doing generation. " "(default: %(default)d)") @@ -97,28 +92,27 @@ def linear(inputs): return hidden_t, cell_t -def seq_to_seq_net(word_vector_dim, - encoder_size, - decoder_size, - source_dict_dim, - target_dict_dim, - is_generating=False, - beam_size=3, - max_length=250): +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): """Construct a seq2seq network.""" feeding_list = ["source_sequence", "target_sequence", "label_sequence"] - def bi_lstm_encoder(input_seq, size): + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. input_forward_proj = fluid.layers.fc(input=input_seq, - size=size * 4, - act='tanh') + size=gate_size * 4, + act='tanh', + bias_attr=True) forward, _ = fluid.layers.dynamic_lstm( - input=input_forward_proj, size=size * 4) + input=input_forward_proj, size=gate_size * 4) input_reversed_proj = fluid.layers.fc(input=input_seq, - size=size * 4, - act='tanh') + size=gate_size * 4, + act='tanh', + bias_attr=True) reversed, _ = fluid.layers.dynamic_lstm( - input=input_reversed_proj, size=size * 4, is_reverse=True) + input=input_reversed_proj, size=gate_size * 4, is_reverse=True) return forward, reversed src_word_idx = fluid.layers.data( @@ -126,11 +120,11 @@ def bi_lstm_encoder(input_seq, size): src_embedding = fluid.layers.embedding( input=src_word_idx, - size=[source_dict_dim, word_vector_dim], + size=[source_dict_dim, embedding_dim], dtype='float32') src_forward, src_reversed = bi_lstm_encoder( - input_seq=src_embedding, size=encoder_size) + input_seq=src_embedding, gate_size=encoder_size) encoded_vector = fluid.layers.concat( input=[src_forward, src_reversed], axis=1) @@ -151,13 +145,15 @@ def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, decoder_boot, decoder_size): def simple_attention(encoder_vec, encoder_proj, decoder_state): decoder_state_proj = fluid.layers.fc(input=decoder_state, - size=decoder_size) + size=decoder_size, + bias_attr=False) decoder_state_expand = fluid.layers.sequence_expand( x=decoder_state_proj, y=encoder_proj) concated = fluid.layers.concat( input=[decoder_state_expand, encoder_proj], axis=1) attention_weights = fluid.layers.fc(input=concated, size=1, + act='tanh', bias_attr=False) attention_weights = fluid.layers.sequence_softmax( x=attention_weights) @@ -191,7 +187,7 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): rnn.update_memory(cell_mem, c) out = fluid.layers.fc(input=h, size=target_dict_dim, - bias_attr=ParamAttr(), + bias_attr=True, act='softmax') rnn.output(out) return rnn() @@ -202,7 +198,7 @@ def simple_attention(encoder_vec, encoder_proj, decoder_state): trg_embedding = fluid.layers.embedding( input=trg_word_idx, - size=[target_dict_dim, word_vector_dim], + size=[target_dict_dim, embedding_dim], dtype='float32') prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, @@ -242,7 +238,7 @@ def lodtensor_to_ndarray(lod_tensor): def train(): avg_cost, feeding_list = seq_to_seq_net( - args.word_vector_dim, + args.embedding_dim, args.encoder_size, args.decoder_size, args.dict_size, @@ -290,7 +286,7 @@ def do_validation(): return total_loss / count - for pass_id in xrange(args.pass_number): + for pass_id in xrange(args.pass_num): pass_start_time = time.time() words_seen = 0 for batch_id, data in enumerate(train_batch_generator()): @@ -323,7 +319,7 @@ def infer(): if __name__ == '__main__': args = parser.parse_args() - if args.mode == 'train': - train() - else: + if args.infer_only: infer() + else: + train() From ac2b8c4dd7ac7f4a5d1bd91fae9e836848ba8bed Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 16 Jan 2018 17:52:19 +0800 Subject: [PATCH 4/4] Not use peephole. --- fluid/machine_translation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fluid/machine_translation.py b/fluid/machine_translation.py index b1f3e30..41c07b5 100644 --- a/fluid/machine_translation.py +++ b/fluid/machine_translation.py @@ -106,13 +106,16 @@ def bi_lstm_encoder(input_seq, gate_size): act='tanh', bias_attr=True) forward, _ = fluid.layers.dynamic_lstm( - input=input_forward_proj, size=gate_size * 4) + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) input_reversed_proj = fluid.layers.fc(input=input_seq, size=gate_size * 4, act='tanh', bias_attr=True) reversed, _ = fluid.layers.dynamic_lstm( - input=input_reversed_proj, size=gate_size * 4, is_reverse=True) + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) return forward, reversed src_word_idx = fluid.layers.data(