From bab8b651936c7c5de194471e2f8333c9e4bfe361 Mon Sep 17 00:00:00 2001 From: Hu Date: Thu, 3 Sep 2020 13:07:19 +0800 Subject: [PATCH 01/22] remove prev_len in hybrid_forward parameters --- scripts/conversion_toolkits/convert_gpt2.py | 4 ++-- src/gluonnlp/models/gpt2.py | 24 ++++++++++----------- tests/test_models_gpt2.py | 24 +++++++-------------- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/scripts/conversion_toolkits/convert_gpt2.py b/scripts/conversion_toolkits/convert_gpt2.py index 7efe720922..fc23ed9809 100644 --- a/scripts/conversion_toolkits/convert_gpt2.py +++ b/scripts/conversion_toolkits/convert_gpt2.py @@ -170,8 +170,8 @@ def test_model(tf_model_path, gluon_model): # gluon model gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx) - gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states, mx.np.array(0, dtype=np.int32, ctx=ctx)) - gl_logits_2, _ = gluon_model(gl_input_ids, gl_states, mx.np.array(seq_length, dtype=np.int32, ctx=ctx)) + gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states) + gl_logits_2, _ = gluon_model(gl_input_ids, gl_states) # tf model with tf.Session(graph=tf.Graph()) as sess: diff --git a/src/gluonnlp/models/gpt2.py b/src/gluonnlp/models/gpt2.py index f24bad0317..5ddc83d5e7 100644 --- a/src/gluonnlp/models/gpt2.py +++ b/src/gluonnlp/models/gpt2.py @@ -180,7 +180,7 @@ def __init__(self, units: int = 768, ) self.hidden_dropout = nn.Dropout(self._hidden_dropout_prob) - def hybrid_forward(self, F, x, layer_states, prev_len): + def hybrid_forward(self, F, x, layer_states): """ Parameters @@ -200,8 +200,10 @@ def hybrid_forward(self, F, x, layer_states, prev_len): x = self.ln(x) if self._layout == 'NT': batch_axis, time_axis = 0, 1 + prev_len = F.npx.shape_array(layer_states)[2] else: batch_axis, time_axis = 1, 0 + prev_len = F.npx.shape_array(layer_states)[1] query, key, value = F.np.split(self.qkv(x), 3, axis=-1) if layer_states is not None: @@ -333,7 +335,7 @@ def __init__(self, units: int = 768, dtype=self._dtype ) - def hybrid_forward(self, F, x, layer_states, prev_len): + def hybrid_forward(self, F, x, layer_states): """ Parameters @@ -349,8 +351,6 @@ def hybrid_forward(self, F, x, layer_states, prev_len): Shape (2, batch_size, prev_len, C_in) - layout = 'TN' Shape (2, prev_len, batch_size, C_in) - prev_len - The previous length Returns ------- @@ -366,7 +366,7 @@ def hybrid_forward(self, F, x, layer_states, prev_len): - layout = 'TN' Shape (2, prev_len + seq_length, batch_size, C_in) """ - h, new_layer_states = self.atten(x, layer_states, prev_len) + h, new_layer_states = self.atten(x, layer_states) x = x + h h = self.ffn(x) return h, new_layer_states @@ -451,7 +451,7 @@ def __init__(self, def layout(self): return self._layout - def hybrid_forward(self, F, x, states, prev_len): + def hybrid_forward(self, F, x, states): """ Parameters @@ -468,8 +468,6 @@ def hybrid_forward(self, F, x, states, prev_len): Shape (num_layers, 2, batch_size, prev_len, C_in)] - layout = 'TN' Shape (num_layers, 2, prev_len, batch_size, C_in)] - prev_len - The previous length. It will be a scalar. 
Returns ------- @@ -486,6 +484,8 @@ def hybrid_forward(self, F, x, states, prev_len): - layout = 'TN' Shape (num_layers, 2, prev_len + seq_length, batch_size, C_in) """ + prev_len = F.npx.shape_array(states)[3] if self._layout == 'NT' else \ + F.npx.shape_array(states)[2] x = self.get_initial_embedding(F, x, prev_len) if self._layout != self._compute_layout: @@ -495,7 +495,7 @@ def hybrid_forward(self, F, x, states, prev_len): new_states = [] for layer_idx in range(self._num_layers): layer_states = None if states is None else states[layer_idx] - x, new_layer_states = self._layers[layer_idx](x, layer_states, prev_len) + x, new_layer_states = self._layers[layer_idx](x, layer_states) new_states.append(new_layer_states) new_states = F.np.stack(new_states, axis=0) @@ -609,7 +609,7 @@ def __init__(self, backbone_cfg=None): ) self._lm_head.weight = self._backbone_model._embed.weight - def hybrid_forward(self, F, inputs, states, prev_len): + def hybrid_forward(self, F, inputs, states): """Getting the logits Parameters @@ -626,8 +626,6 @@ def hybrid_forward(self, F, inputs, states, prev_len): Shape (num_layers, 2, batch_size, prev_len, C_in) - layout = 'TN' Shape (num_layers, 2, prev_len, batch_size, C_in) - prev_len - Will be a scalar that represents the previous length Returns ------- @@ -642,7 +640,7 @@ def hybrid_forward(self, F, inputs, states, prev_len): - layout = 'TN' Shape (num_layers, 2, prev_len + seq_length, batch_size, C_in) """ - contextual_embeddings, new_states = self._backbone_model(inputs, states, prev_len) + contextual_embeddings, new_states = self._backbone_model(inputs, states) logits = self._lm_head(contextual_embeddings) return logits, new_states diff --git a/tests/test_models_gpt2.py b/tests/test_models_gpt2.py index 25f3ef6977..8de52efb16 100644 --- a/tests/test_models_gpt2.py +++ b/tests/test_models_gpt2.py @@ -41,16 +41,14 @@ def test_gpt2_small_config(compute_layout, ctx): gpt2_model.hybridize() hiddens, _ = gpt2_model( inputs, - gpt2_model.init_states(batch_size, ctx), - mx.np.array(0, dtype=np.int32, ctx=ctx) + gpt2_model.init_states(batch_size, ctx) ) gpt2_model_tn = GPT2Model.from_cfg(cfg_tn) gpt2_model_tn.share_parameters(gpt2_model.collect_params()) gpt2_model_tn.hybridize() hiddens_tn, _ = gpt2_model_tn( inputs.T, - gpt2_model_tn.init_states(batch_size, ctx), - mx.np.array(0, dtype=np.int32, ctx=ctx) + gpt2_model_tn.init_states(batch_size, ctx) ) assert_allclose(np.swapaxes(hiddens_tn.asnumpy(), 0, 1), hiddens.asnumpy(), 1E-4, 1E-4) @@ -61,16 +59,14 @@ def test_gpt2_small_config(compute_layout, ctx): gpt2_lm_model.hybridize() logits, states = gpt2_lm_model( inputs, - gpt2_lm_model.init_states(batch_size, ctx), - mx.np.array(0, dtype=np.int32, ctx=ctx) + gpt2_lm_model.init_states(batch_size, ctx) ) gpt2_lm_model_tn = GPT2ForLM(cfg_tn) gpt2_lm_model_tn.share_parameters(gpt2_lm_model.collect_params()) gpt2_lm_model_tn.hybridize() logits_tn, states_tn = gpt2_lm_model_tn( inputs.T, - gpt2_lm_model_tn.init_states(batch_size, ctx), - mx.np.array(0, dtype=np.int32, ctx=ctx) + gpt2_lm_model_tn.init_states(batch_size, ctx) ) assert_allclose(np.swapaxes(logits_tn.asnumpy(), 0, 1), logits.asnumpy(), 1E-4, 1E-4) @@ -91,8 +87,7 @@ def test_gpt2_incremental_states(ctx): one_time_hiddens, one_time_states = gpt2_model( inputs, - gpt2_model.init_states(batch_size, ctx), - mx.np.array(0, dtype=np.int32, ctx=ctx) + gpt2_model.init_states(batch_size, ctx) ) states = gpt2_model.init_states(batch_size, ctx) @@ -100,8 +95,7 @@ def test_gpt2_incremental_states(ctx): for i in 
range(sequence_length):
         hiddens, states = gpt2_model(
             inputs[:, i:i+1],
-            states,
-            mx.np.array(i, dtype=np.int32, ctx=ctx)
+            states
         )
         hiddens_l.append(hiddens)
     hiddens_concat = mx.np.concatenate(hiddens_l, axis=1)
@@ -143,8 +137,7 @@ def test_gpt2(model_name, ctx):
         )
         logits, _ = gpt2_lm_model(
             input_ids,
-            gpt2_lm_model.init_states(batch_size, ctx),
-            mx.np.array(0, dtype=np.int32, ctx=ctx)
+            gpt2_lm_model.init_states(batch_size, ctx)
         )
         mx.npx.waitall()
         # test backward
         with mx.autograd.record():
             logits, _ = gpt2_lm_model(
                 input_ids,
-                gpt2_lm_model.init_states(batch_size, ctx),
-                mx.np.array(0, dtype=np.int32, ctx=ctx)
+                gpt2_lm_model.init_states(batch_size, ctx)
             )
             loss = label_smooth_loss(logits, input_ids)
             loss.backward()

From b8268aed8713d6c86d52a83492d78bc339bc060c Mon Sep 17 00:00:00 2001
From: Hu
Date: Thu, 3 Sep 2020 13:29:04 +0800
Subject: [PATCH 02/22] update

---
 src/gluonnlp/models/gpt2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gluonnlp/models/gpt2.py b/src/gluonnlp/models/gpt2.py
index 5ddc83d5e7..f9abc5d3bc 100644
--- a/src/gluonnlp/models/gpt2.py
+++ b/src/gluonnlp/models/gpt2.py
@@ -195,7 +195,6 @@ def hybrid_forward(self, F, x, layer_states):
             Shape (2, batch_size, prev_len, C_in)
         - layout = 'TN'
             Shape (2, prev_len, batch_size, C_in)
-        prev_len
         """
         x = self.ln(x)
         if self._layout == 'NT':

From fe5a93fb3aac0a28a5288e3468de4dbd930382f9 Mon Sep 17 00:00:00 2001
From: Hu
Date: Sat, 5 Sep 2020 14:08:44 +0800
Subject: [PATCH 03/22] sample

---
 scripts/genertate/sample_gpt2.py | 109 +++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 scripts/genertate/sample_gpt2.py

diff --git a/scripts/genertate/sample_gpt2.py b/scripts/genertate/sample_gpt2.py
new file mode 100644
index 0000000000..53dbe0d42e
--- /dev/null
+++ b/scripts/genertate/sample_gpt2.py
@@ -0,0 +1,109 @@
+import numpy as np
+import random
+import os
+import mxnet as mx
+from mxnet import gluon
+import argparse
+import logging
+import time
+from gluonnlp.utils.misc import logging_config
+from gluonnlp.models.transformer import TransformerModel,\
+    TransformerNMTInference
+from gluonnlp.data.batchify import Tuple, Pad, Stack
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data import tokenizers
+from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder
+import sacrebleu
+from tqdm import tqdm
+
+from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2
+
+mx.npx.set_np()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='')
+    parser.add_argument('--model_name', type=str, default='gpt2_124M',
+                        choices=list_pretrained_gpt2(), help='')
+    parser.add_argument('--seed', type=int, default=None, help='The random seed.')
+    parser.add_argument('--nsamples', type=int, default=0, help='')
+    parser.add_argument('--batch_size', type=int, default=1, help='')
+    parser.add_argument('--length', type=int, default=None, help='')
+    parser.add_argument('--temperature', type=float, default=1.0, help='')
+    parser.add_argument('--top_k', type=int, default=-1, help='')
+    parser.add_argument('--top_p', type=float, required=-1.0, help='')
+    parser.add_argument('--gpu', type=int, default=0, help='')
+
+
+
+# input = prev , states = None, output += new samples ()
+# when feeding the start token, keep generating
+# when feeding the context
+
+
+class GPT2Decoder(BaseStepDecoder):
+    def __init__(self, gpt2_lm_model):
+        self._gpt2_lm_model = gpt2_lm_model
+    @property
+    def state_batch_axis(self):
+        return 2 if 
self._gpt2_lm_model._backbone_model.layout == 'NT' else 3 + def init_states(self, batch_size, ctx): + return self._gpt2_lm_model.init_states(batch_size, ctx) + def __call__(self, data, states): + return self._gpt2_lm_model(data, states) + + +def sample_gpt2(args): + ctx = mx.gpu(args.gpu) if args.gpu is not None else \ + mx.cpu() + + cfg, tokenizer, _, lm_params_path = get_pretrained_gpt2( + model_name=args.model_name, + load_backbone=False, + load_lm=True) + + if args.length is None: + args.length = cfg.MODEL.max_length + assert args.length <= cfg.MODEL.max_length, \ + "Can't get samples longer than window size: {}".format(cfg.MODEL.max_length) + + model = GPT2ForLM(cfg) + model.hybridize() + model.load_parameters(lm_params_path, ctx=ctx) + gpt2decoder = GPT2Decoder(model) + + sampler = BeamSearchSampler( + beam_size=1, + decoder=gpt2decoder, + eos_id=tokenizer.eos_id, + vocab_size=cfg.MODEL.vocab_size, + max_length_a=0, + max_length_b=cfg.MODEL.max_length, + min_length=1, + temperature=args.temperature, + sampling=True, + sampling_topp=args.top_p, + sampling_topk=args.top_k, + early_return=True + ) + + start_input = mx.np.full((args.batch_size, 1), tokenizer.bos_id) + start_states = gpt2decoder.init_states(args.batch_size, ctx) + + generated = 0 + while args.nsamples <= 0 or generated < args.nsamples: + samples = sampler(start_input, start_states) + for i in args.batch_size: + text = tokenizer.decode(samples[i][0]) + print(text) + generated += args.batch_size + + +if __name__ == '__main__': + os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' + args = parse_args() + np.random.seed(args.seed) + mx.random.seed(args.seed) + random.seed(args.seed) + sample_gpt2(args) \ No newline at end of file From c5d5ef322feb2fd8ad1ca36468faf51c942186c3 Mon Sep 17 00:00:00 2001 From: Hu Date: Sat, 5 Sep 2020 14:43:04 +0800 Subject: [PATCH 04/22] update --- scripts/genertate/sample_gpt2.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/genertate/sample_gpt2.py b/scripts/genertate/sample_gpt2.py index 53dbe0d42e..e724fb9f57 100644 --- a/scripts/genertate/sample_gpt2.py +++ b/scripts/genertate/sample_gpt2.py @@ -26,15 +26,15 @@ def parse_args(): description='') parser.add_argument('--model_name', type=str, default='gpt2_124M', choices=list_pretrained_gpt2(), help='') - parser.add_argument('--seed', type=int, default=None, help='The random seed.') + parser.add_argument('--seed', type=int, default=100, help='The random seed.') parser.add_argument('--nsamples', type=int, default=0, help='') parser.add_argument('--batch_size', type=int, default=1, help='') parser.add_argument('--length', type=int, default=None, help='') parser.add_argument('--temperature', type=float, default=1.0, help='') parser.add_argument('--top_k', type=int, default=-1, help='') - parser.add_argument('--top_p', type=float, required=-1.0, help='') + parser.add_argument('--top_p', type=float, default=-1.0, help='') parser.add_argument('--gpu', type=int, default=0, help='') - + return parser.parse_args() # input = prev , states = None, output += new samples () @@ -51,7 +51,8 @@ def state_batch_axis(self): def init_states(self, batch_size, ctx): return self._gpt2_lm_model.init_states(batch_size, ctx) def __call__(self, data, states): - return self._gpt2_lm_model(data, states) + logits, new_states = self._gpt2_lm_model(data, states) + return logits[:,-1,:], new_states def sample_gpt2(args): @@ -76,7 +77,7 @@ def sample_gpt2(args): sampler = BeamSearchSampler( beam_size=1, decoder=gpt2decoder, - 
eos_id=tokenizer.eos_id, + eos_id=tokenizer.vocab.eos_id, vocab_size=cfg.MODEL.vocab_size, max_length_a=0, max_length_b=cfg.MODEL.max_length, @@ -88,14 +89,14 @@ def sample_gpt2(args): early_return=True ) - start_input = mx.np.full((args.batch_size, 1), tokenizer.bos_id) + start_input = mx.np.full((args.batch_size, 1), tokenizer.vocab.eos_id, ctx=ctx) start_states = gpt2decoder.init_states(args.batch_size, ctx) generated = 0 while args.nsamples <= 0 or generated < args.nsamples: samples = sampler(start_input, start_states) for i in args.batch_size: - text = tokenizer.decode(samples[i][0]) + text = tokenizer.decode(samples[i][0].asnumpy()) print(text) generated += args.batch_size From 59ce7cbb254f963049538824093c56d0323c3535 Mon Sep 17 00:00:00 2001 From: Hu Date: Sun, 6 Sep 2020 13:27:54 +0800 Subject: [PATCH 05/22] add gpt2_1558M --- scripts/conversion_toolkits/convert_gpt2.sh | 2 +- src/gluonnlp/models/gpt2.py | 17 +++++++++ .../models/model_zoo_checksums/gpt2.txt | 35 +++++++++++-------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/scripts/conversion_toolkits/convert_gpt2.sh b/scripts/conversion_toolkits/convert_gpt2.sh index a551250c4b..febc2ec2f8 100644 --- a/scripts/conversion_toolkits/convert_gpt2.sh +++ b/scripts/conversion_toolkits/convert_gpt2.sh @@ -1,6 +1,6 @@ python3 -m pip install tensorflow==1.15 --upgrade --user git clone https://github.com/openai/gpt-2.git gpt_2 -for model in 124M 355M 774M +for model in 124M 355M 774M 1558M do python3 gpt_2/download_model.py ${model} mkdir gpt2_${model} diff --git a/src/gluonnlp/models/gpt2.py b/src/gluonnlp/models/gpt2.py index f9abc5d3bc..206bd7b372 100644 --- a/src/gluonnlp/models/gpt2.py +++ b/src/gluonnlp/models/gpt2.py @@ -96,6 +96,16 @@ def gpt2_774M(): cfg.freeze() return cfg +@gpt2_cfg_reg.register() +def gpt2_1558M(): + cfg = gpt2_124M() + cfg.defrost() + cfg.MODEL.num_heads = 25 + cfg.MODEL.num_layers = 48 + cfg.MODEL.units = 1600 + cfg.freeze() + return cfg + PRETRAINED_URL = { 'gpt2_124M': { 'cfg': gpt2_124M(), @@ -118,6 +128,13 @@ def gpt2_774M(): 'params': 'gpt2_774M/model-9917e24e.params', 'lm_params': 'gpt2_774M/model_lm-cfbfa641.params' }, + 'gpt2_1558M': { + 'cfg': gpt2_1558M(), + 'merges': 'gpt2_1558M/gpt2-396d4d8e.merges', + 'vocab': 'gpt2_1558M/gpt2-9dc62091.vocab', + 'params': 'gpt2_1558M/model-af3dd713.params', + 'lm_params': 'gpt2_1558M/model_lm-c8489dcb.params' + }, } diff --git a/src/gluonnlp/models/model_zoo_checksums/gpt2.txt b/src/gluonnlp/models/model_zoo_checksums/gpt2.txt index f117b813d1..a315a622e6 100644 --- a/src/gluonnlp/models/model_zoo_checksums/gpt2.txt +++ b/src/gluonnlp/models/model_zoo_checksums/gpt2.txt @@ -1,15 +1,20 @@ -gpt2_124M/model-fac1f39c.yml fac1f39c804e324c69162b9b37bd24ab98241612 424 -gpt2_124M/model_lm-99b90604.params 99b9060488b4542ccd045c28401da10a3158ca80 497771820 -gpt2_124M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 -gpt2_124M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 -gpt2_124M/model-bfed311d.params bfed311d5c980ba475f90ccf7f536d25c3b40386 497769466 -gpt2_355M/model-2aea05ff.yml 2aea05ff1e67ef816b3f824102da8b7b1292a620 425 -gpt2_355M/model_lm-eed0e964.params eed0e964f4222823a557acfee2c106f228ce0188 1419317644 -gpt2_355M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 -gpt2_355M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 -gpt2_355M/model-81dee612.params 81dee612413733899f6e5fbbeac91da781805e1b 1419312986 -gpt2_774M/model-c9555788.yml 
c95557880783ec4f94b09b5b045c8d9e9a198e4d 425 -gpt2_774M/model_lm-cfbfa641.params cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6 3096157676 -gpt2_774M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 -gpt2_774M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 -gpt2_774M/model-9917e24e.params 9917e24e89c651793adea69042d6cceddfc7973c 3096150714 +gpt2_124M/model-fac1f39c.yml fac1f39c804e324c69162b9b37bd24ab98241612 424 +gpt2_124M/model_lm-99b90604.params 99b9060488b4542ccd045c28401da10a3158ca80 497771820 +gpt2_124M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 +gpt2_124M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 +gpt2_124M/model-bfed311d.params bfed311d5c980ba475f90ccf7f536d25c3b40386 497769466 +gpt2_355M/model-2aea05ff.yml 2aea05ff1e67ef816b3f824102da8b7b1292a620 425 +gpt2_355M/model_lm-eed0e964.params eed0e964f4222823a557acfee2c106f228ce0188 1419317644 +gpt2_355M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 +gpt2_355M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 +gpt2_355M/model-81dee612.params 81dee612413733899f6e5fbbeac91da781805e1b 1419312986 +gpt2_774M/model-c9555788.yml c95557880783ec4f94b09b5b045c8d9e9a198e4d 425 +gpt2_774M/model_lm-cfbfa641.params cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6 3096157676 +gpt2_774M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 +gpt2_774M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 +gpt2_774M/model-9917e24e.params 9917e24e89c651793adea69042d6cceddfc7973c 3096150714 +gpt2_1558M/model-3f8e3175.yml 3f8e3175b9faad1cc95df2f542b486698aeb5665 425 +gpt2_1558M/model_lm-c8489dcb.params c8489dcbdb0d39bc3eac6d1d62e0e3dace9faa8f 6230494540 +gpt2_1558M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 +gpt2_1558M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 +gpt2_1558M/model-af3dd713.params af3dd71313b55b4be5f52bdd538c9db054c1e190 6230485274 From aae228ca57b959f886c0489e9923bbb27ef5eb6d Mon Sep 17 00:00:00 2001 From: Hu Date: Sun, 6 Sep 2020 14:28:55 +0800 Subject: [PATCH 06/22] update --- ...=> generate_unconditional_gpt2_samples.py} | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) rename scripts/genertate/{sample_gpt2.py => generate_unconditional_gpt2_samples.py} (63%) diff --git a/scripts/genertate/sample_gpt2.py b/scripts/genertate/generate_unconditional_gpt2_samples.py similarity index 63% rename from scripts/genertate/sample_gpt2.py rename to scripts/genertate/generate_unconditional_gpt2_samples.py index e724fb9f57..e8e7cd2839 100644 --- a/scripts/genertate/sample_gpt2.py +++ b/scripts/genertate/generate_unconditional_gpt2_samples.py @@ -2,46 +2,38 @@ import random import os import mxnet as mx -from mxnet import gluon import argparse -import logging -import time -from gluonnlp.utils.misc import logging_config -from gluonnlp.models.transformer import TransformerModel,\ - TransformerNMTInference -from gluonnlp.data.batchify import Tuple, Pad, Stack -from gluonnlp.data.filtering import MosesNormalizer -from gluonnlp.data import tokenizers from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder -import sacrebleu -from tqdm import tqdm - from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2 mx.npx.set_np() - def parse_args(): parser = argparse.ArgumentParser( - description='') + description='GPT-2 unconditional sampler. 
Load a GPT-2 model and sample.') parser.add_argument('--model_name', type=str, default='gpt2_124M', - choices=list_pretrained_gpt2(), help='') - parser.add_argument('--seed', type=int, default=100, help='The random seed.') - parser.add_argument('--nsamples', type=int, default=0, help='') - parser.add_argument('--batch_size', type=int, default=1, help='') - parser.add_argument('--length', type=int, default=None, help='') - parser.add_argument('--temperature', type=float, default=1.0, help='') - parser.add_argument('--top_k', type=int, default=-1, help='') - parser.add_argument('--top_p', type=float, default=-1.0, help='') - parser.add_argument('--gpu', type=int, default=0, help='') + choices=list_pretrained_gpt2(), help='Model name') + parser.add_argument('--seed', type=int, default=100, help='The random seed') + parser.add_argument('--nsamples', type=int, default=0, help='Number of samples to return') + parser.add_argument('--batch_size', type=int, default=1, help='Number of batches') + parser.add_argument('--length', type=int, default=None, + help='Number of tokens in generated text, if None (default), is ' + 'determined by model max_length') + parser.add_argument('--temperature', type=float, default=1.0, + help='') + parser.add_argument('--top_k', type=int, default=-1, + help='Multinomial sampling with topk, ' + 'see [ACL2018] "Hierarchical Neural Story Generation"' + 'https://www.aclweb.org/anthology/P18-1082.pdf') + parser.add_argument('--top_p', type=float, default=-1.0, + help='Multinomial sampling with topp, ' + 'see [ICLR2020] "The Curious Case of Neural Text Degeneration"' + 'https://arxiv.org/abs/1904.09751') + parser.add_argument('--gpu', type=int, default=0, + help='Which gpu to use, set None to use cpu') return parser.parse_args() -# input = prev , states = None, output += new samples () -# 输入start token时 一直 -# 输入context 时 - - class GPT2Decoder(BaseStepDecoder): def __init__(self, gpt2_lm_model): self._gpt2_lm_model = gpt2_lm_model @@ -51,6 +43,7 @@ def state_batch_axis(self): def init_states(self, batch_size, ctx): return self._gpt2_lm_model.init_states(batch_size, ctx) def __call__(self, data, states): + data = mx.npx.reshape(data, (-1, 1)) logits, new_states = self._gpt2_lm_model(data, states) return logits[:,-1,:], new_states @@ -80,13 +73,13 @@ def sample_gpt2(args): eos_id=tokenizer.vocab.eos_id, vocab_size=cfg.MODEL.vocab_size, max_length_a=0, - max_length_b=cfg.MODEL.max_length, + max_length_b=args.length, min_length=1, temperature=args.temperature, sampling=True, sampling_topp=args.top_p, sampling_topk=args.top_k, - early_return=True + early_return=False ) start_input = mx.np.full((args.batch_size, 1), tokenizer.vocab.eos_id, ctx=ctx) @@ -94,9 +87,11 @@ def sample_gpt2(args): generated = 0 while args.nsamples <= 0 or generated < args.nsamples: - samples = sampler(start_input, start_states) - for i in args.batch_size: - text = tokenizer.decode(samples[i][0].asnumpy()) + samples, _, _ = sampler(start_input, start_states) + for i in range(args.batch_size): + ids = samples[i][0].asnumpy().tolist() + ids = ids[1:ids.index(-1)] + text = tokenizer.decode(ids) print(text) generated += args.batch_size From e6c75fb3e9b3feb8d5ccca44427047ed336d3c40 Mon Sep 17 00:00:00 2001 From: Hu Date: Sun, 6 Sep 2020 14:41:21 +0800 Subject: [PATCH 07/22] update --- .../generate_unconditional_gpt2_samples.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/genertate/generate_unconditional_gpt2_samples.py 
b/scripts/genertate/generate_unconditional_gpt2_samples.py index e8e7cd2839..64f167d443 100644 --- a/scripts/genertate/generate_unconditional_gpt2_samples.py +++ b/scripts/genertate/generate_unconditional_gpt2_samples.py @@ -13,7 +13,7 @@ def parse_args(): description='GPT-2 unconditional sampler. Load a GPT-2 model and sample.') parser.add_argument('--model_name', type=str, default='gpt2_124M', choices=list_pretrained_gpt2(), help='Model name') - parser.add_argument('--seed', type=int, default=100, help='The random seed') + parser.add_argument('--seed', type=int, default=None, help='The random seed') parser.add_argument('--nsamples', type=int, default=0, help='Number of samples to return') parser.add_argument('--batch_size', type=int, default=1, help='Number of batches') parser.add_argument('--length', type=int, default=None, @@ -90,8 +90,10 @@ def sample_gpt2(args): samples, _, _ = sampler(start_input, start_states) for i in range(args.batch_size): ids = samples[i][0].asnumpy().tolist() - ids = ids[1:ids.index(-1)] + ids = ids[1:ids.index(-1)] if -1 in ids else \ + ids[1:] text = tokenizer.decode(ids) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) generated += args.batch_size @@ -99,7 +101,8 @@ def sample_gpt2(args): if __name__ == '__main__': os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' args = parse_args() - np.random.seed(args.seed) - mx.random.seed(args.seed) - random.seed(args.seed) + if args.seed is not None: + np.random.seed(args.seed) + mx.random.seed(args.seed) + random.seed(args.seed) sample_gpt2(args) \ No newline at end of file From 4ee0ea6c7f51fb43be9793a0e92fdaab44e3b6cd Mon Sep 17 00:00:00 2001 From: Hu Date: Mon, 7 Sep 2020 20:05:00 +0800 Subject: [PATCH 08/22] update --- scripts/genertate/generate_unconditional_gpt2_samples.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/genertate/generate_unconditional_gpt2_samples.py b/scripts/genertate/generate_unconditional_gpt2_samples.py index 64f167d443..2f2d217175 100644 --- a/scripts/genertate/generate_unconditional_gpt2_samples.py +++ b/scripts/genertate/generate_unconditional_gpt2_samples.py @@ -70,7 +70,7 @@ def sample_gpt2(args): sampler = BeamSearchSampler( beam_size=1, decoder=gpt2decoder, - eos_id=tokenizer.vocab.eos_id, + eos_id=None, vocab_size=cfg.MODEL.vocab_size, max_length_a=0, max_length_b=args.length, @@ -89,13 +89,13 @@ def sample_gpt2(args): while args.nsamples <= 0 or generated < args.nsamples: samples, _, _ = sampler(start_input, start_states) for i in range(args.batch_size): + generated += 1 ids = samples[i][0].asnumpy().tolist() ids = ids[1:ids.index(-1)] if -1 in ids else \ ids[1:] text = tokenizer.decode(ids) print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) print(text) - generated += args.batch_size if __name__ == '__main__': From e300649c851b7cd5c858c2811576c23002304b13 Mon Sep 17 00:00:00 2001 From: Hu Date: Tue, 8 Sep 2020 00:32:21 +0800 Subject: [PATCH 09/22] update --- scripts/genertate/README.md | 15 +++++++++ scripts/genertate/calculate_metrics.py | 43 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 scripts/genertate/README.md create mode 100644 scripts/genertate/calculate_metrics.py diff --git a/scripts/genertate/README.md b/scripts/genertate/README.md new file mode 100644 index 0000000000..da6188342d --- /dev/null +++ b/scripts/genertate/README.md @@ -0,0 +1,15 @@ +Use the following command to generate gpt2 samples +```bash +python generate_unconditional_gpt2_samples.py \ + --model_name 
gpt2_774M + --nsamples 5000 > samples +``` + +Some metrics for the unconditional generated text + +| GPT2 774M | Perplexity | Self-BLEU4 |Zipf Coefficient| Repetition % | +|---------------|--------------|----------------|----------------|----------------| +| openai | | | - | - | +| gluon | | - | - | - | +| | | - | - | - | + diff --git a/scripts/genertate/calculate_metrics.py b/scripts/genertate/calculate_metrics.py new file mode 100644 index 0000000000..0299bccf9e --- /dev/null +++ b/scripts/genertate/calculate_metrics.py @@ -0,0 +1,43 @@ +import re + +import argparse +import sacrebleu + + +def parse_args(): + parser = argparse.ArgumentParser( + description='') + parser.add_argument('--generated_file', type=str, required=True, help='Model name') + return parser.parse_args() + + +def calculate_self_bleu4(samples): + pass + +def calculate_zipf_coefficient(): + pass + +def calculate_repetition(): + pass + +def calculate_metrics(args): + with open(args.generated_file, encoding='utf-8') as of: + samples = of.read() + pattern = '='*40 + ' SAMPLE \d+ ' + '='*40 + '\n' + samples = re.split(pattern, samples)[1:] + + # self bleu4 + self_bleu4 = calculate_self_bleu4(samples) + + # zipf coefficient + zipf_coefficient = calculate_zipf_coefficient() + + # repetition + repetition = calculate_repetition() + + print() + +if __name__ == '__main__': + args = parse_args() + calculate_metrics(args) + \ No newline at end of file From a272d8dddd32ccb18c9d2df2ac3836b1bf44d727 Mon Sep 17 00:00:00 2001 From: Hu Date: Thu, 10 Sep 2020 03:02:38 +0800 Subject: [PATCH 10/22] update --- scripts/genertate/README.md | 22 +++- scripts/genertate/calculate_metrics.py | 90 +++++++++++--- .../generate_unconditional_gpt2_samples.py | 2 +- .../interactive_conditional_gpt2_samples.py | 116 ++++++++++++++++++ 4 files changed, 209 insertions(+), 21 deletions(-) create mode 100644 scripts/genertate/interactive_conditional_gpt2_samples.py diff --git a/scripts/genertate/README.md b/scripts/genertate/README.md index da6188342d..834d16e688 100644 --- a/scripts/genertate/README.md +++ b/scripts/genertate/README.md @@ -1,10 +1,28 @@ + +Some of the examples below may include Unicode text characters. 
Set the environment variable:
+```bash
+export PYTHONIOENCODING=UTF-8
+```
+
 Use the following command to generate gpt2 samples
 ```bash
-python generate_unconditional_gpt2_samples.py \
-    --model_name gpt2_774M
+python3 generate_unconditional_gpt2_samples.py \
+    --model_name gpt2_774M \
     --nsamples 5000 > samples
 ```
 
+```bash
+python3 interactive_conditional_gpt2_samples.py \
+    --model_name gpt2_774M \
+    --nsamples 1
+```
+
+```bash
+python3 calculate_metrics.py samples
+```
+
+# Paste some samples here
+
 Some metrics for the unconditional generated text
 
 | GPT2 774M | Perplexity | Self-BLEU4 |Zipf Coefficient| Repetition % |
diff --git a/scripts/genertate/calculate_metrics.py b/scripts/genertate/calculate_metrics.py
index 0299bccf9e..369120b9fe 100644
--- a/scripts/genertate/calculate_metrics.py
+++ b/scripts/genertate/calculate_metrics.py
@@ -1,43 +1,97 @@
 import re
-
 import argparse
 import sacrebleu
+from collections import Counter
+import operator
+import numpy as np
+from scipy import stats
+import os
+import random
+from gluonnlp.base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir
+from gluonnlp.utils.misc import load_checksum_stats, download
+from gluonnlp.data.tokenizers import HuggingFaceByteBPETokenizer
 
 
 def parse_args():
     parser = argparse.ArgumentParser(
-        description='')
+        description='Calculate metrics for the generated sentences')
     parser.add_argument('--generated_file', type=str, required=True, help='Model name')
+    parser.add_argument('--num_samples', type=int, default=1000, help='')
+    parser.add_argument('--num_bleu_samples', type=int, default=1000, help='')
     return parser.parse_args()
 
 
-def calculate_self_bleu4(samples):
-    pass
+def calculate_self_bleu4(samples, num_bleu_samples):
+    sys_indices = random.sample(range(len(samples)), num_bleu_samples)
+    res = 0
+    for sys_indice in sys_indices:
+        # remove it self
+        ref = samples[:sys_indice] + samples[sys_indice+1:]
+        sacrebleu_out = sacrebleu.corpus_bleu(
+            sys_stream=samples[sys_indice],
+            ref_streams=ref)
+        res += sacrebleu_out.score
+    res /= len(samples)
+    return res
+
 
-def calculate_zipf_coefficient():
-    pass
+def calculate_zipf_coefficient(sample_ids, tokenizer):
+    cnt = Counter()
+    for sample_id in sample_ids:
+        cnt.update(sample_id)
+
+    xs = np.arange(1, min(len(cnt), len(tokenizer.vocab)))
+    ys = np.array(sorted(cnt.values(), key=operator.neg)[:len(tokenizer.vocab)])
+    _, _, r, _, _ = stats.linregress(np.log(xs), np.log(ys))
+    return r
+
 
-def calculate_repetition():
-    pass
+def calculate_repetition(sample_ids):
+    max_n = 90
+    res = 0
+    for sample_id in sample_ids:
+        rev = list(reversed(sample_id))
+        last_n_repeats = [0 for _ in range(max_n)]
+        for n in range(1, max_n + 1):
+            n_repeat = 1
+            while len(rev[n*n_repeat:n*(n_repeat+1)]) == n and \
+                    rev[n*n_repeat:n*(n_repeat+1)] == rev[:n]:
+                n_repeat += 1
+            last_n_repeat[n-1] = n_repeat
+#        res += (sum(last_n_repeat) / ) TODO
+
 
 def calculate_metrics(args):
     with open(args.generated_file, encoding='utf-8') as of:
         samples = of.read()
     pattern = '='*40 + ' SAMPLE \d+ ' + '='*40 + '\n'
     samples = re.split(pattern, samples)[1:]
+    samples = samples[:args.num_samples]
+    assert len(samples) == args.num_samples
 
-    # self bleu4
-    self_bleu4 = calculate_self_bleu4(samples)
-
-    # zipf coefficient
-    zipf_coefficient = calculate_zipf_coefficient()
-
-    # repetition
-    repetition = calculate_repetition()
-
-    print()
+    local_paths = {}
+    download_jobs = [('vocab', 'gpt2_124M/gpt2-9dc62091.vocab'),
+                     ('merges', 'gpt2_124M/gpt2-396d4d8e.merges')]
+    FILE_STATS = 
load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'gpt2.txt'))
+    for k, path in download_jobs:
+        local_paths[k] = download(url=get_repo_model_zoo_url() + path,
+                                  path=os.path.join(get_model_zoo_home_dir(), path),
+                                  sha1_hash=FILE_STATS[path])
+    tokenizer = HuggingFaceByteBPETokenizer(
+        merges_file=local_paths['merges'],
+        vocab_file=local_paths['vocab'])
+    sample_ids = tokenizer.encode(samples, output_type=int)
+
+    self_bleu4 = calculate_self_bleu4(samples, args.num_bleu_samples)
+    zipf_coefficient = calculate_zipf_coefficient(sample_ids, tokenizer)
+    repetition = calculate_repetition(sample_ids)
+    print('Self BLEU 4: {}\n'
+          'Zipf coefficient: {}\n'
+          'Repetition: {}\n'
+          .format(self_bleu4, zipf_coefficient, repetition))
+
 
 if __name__ == '__main__':
     args = parse_args()
     calculate_metrics(args)
-    
\ No newline at end of file
diff --git a/scripts/genertate/generate_unconditional_gpt2_samples.py b/scripts/genertate/generate_unconditional_gpt2_samples.py
index 2f2d217175..e16214ab42 100644
--- a/scripts/genertate/generate_unconditional_gpt2_samples.py
+++ b/scripts/genertate/generate_unconditional_gpt2_samples.py
@@ -105,4 +105,4 @@ def sample_gpt2(args):
         np.random.seed(args.seed)
         mx.random.seed(args.seed)
         random.seed(args.seed)
-    sample_gpt2(args)
\ No newline at end of file
+    sample_gpt2(args)
diff --git a/scripts/genertate/interactive_conditional_gpt2_samples.py b/scripts/genertate/interactive_conditional_gpt2_samples.py
new file mode 100644
index 0000000000..ce1fd603a8
--- /dev/null
+++ b/scripts/genertate/interactive_conditional_gpt2_samples.py
@@ -0,0 +1,116 @@
+import numpy as np
+import random
+import os
+import mxnet as mx
+import argparse
+from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder
+from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2
+
+mx.npx.set_np()
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='GPT-2 conditional sampler. 
Load a GPT-2 model and sample.') + parser.add_argument('--model_name', type=str, default='gpt2_124M', + choices=list_pretrained_gpt2(), help='Model name') + parser.add_argument('--seed', type=int, default=None, help='The random seed') + parser.add_argument('--nsamples', type=int, default=0, help='Number of samples to return') + parser.add_argument('--batch_size', type=int, default=1, help='Number of batches') + parser.add_argument('--length', type=int, default=None, + help='Number of tokens in generated text, if None (default), is ' + 'determined by model max_length') + parser.add_argument('--temperature', type=float, default=1.0, + help='') + parser.add_argument('--top_k', type=int, default=-1, + help='Multinomial sampling with topk, ' + 'see [ACL2018] "Hierarchical Neural Story Generation"' + 'https://www.aclweb.org/anthology/P18-1082.pdf') + parser.add_argument('--top_p', type=float, default=-1.0, + help='Multinomial sampling with topp, ' + 'see [ICLR2020] "The Curious Case of Neural Text Degeneration"' + 'https://arxiv.org/abs/1904.09751') + parser.add_argument('--gpu', type=int, default=0, + help='Which gpu to use, set None to use cpu') + return parser.parse_args() + + +class GPT2Decoder(BaseStepDecoder): + def __init__(self, gpt2_lm_model): + self._gpt2_lm_model = gpt2_lm_model + @property + def state_batch_axis(self): + return 2 if self._gpt2_lm_model._backbone_model.layout == 'NT' else 3 + def init_states(self, batch_size, ctx): + return self._gpt2_lm_model.init_states(batch_size, ctx) + def __call__(self, data, states): + data = mx.npx.reshape(data, (-1, 1)) + logits, new_states = self._gpt2_lm_model(data, states) + return logits[:,-1,:], new_states + + +def sample_gpt2(args): + ctx = mx.gpu(args.gpu) if args.gpu is not None else \ + mx.cpu() + + cfg, tokenizer, _, lm_params_path = get_pretrained_gpt2( + model_name=args.model_name, + load_backbone=False, + load_lm=True) + + if args.length is None: + args.length = cfg.MODEL.max_length + assert args.length <= cfg.MODEL.max_length, \ + "Can't get samples longer than window size: {}".format(cfg.MODEL.max_length) + + model = GPT2ForLM(cfg) + model.hybridize() + model.load_parameters(lm_params_path, ctx=ctx) + gpt2decoder = GPT2Decoder(model) + + sampler = BeamSearchSampler( + beam_size=1, + decoder=gpt2decoder, + eos_id=None, + vocab_size=cfg.MODEL.vocab_size, + max_length_a=0, + max_length_b=args.length, + min_length=1, + temperature=args.temperature, + sampling=True, + sampling_topp=args.top_p, + sampling_topk=args.top_k, + early_return=False + ) + start_states = gpt2decoder.init_states(args.batch_size, ctx) + + while True: + raw_text = input('Model prompt >>> ') + while not raw_text: + print('Prompt should not be empty!') + raw_text = input("Model prompt >>> ") + context_tokens = tokenizer.encode(raw_text) + start_input = mx.np.repeat(mx.np.expand_dims(mx.np.array(context_tokens, ctx), 0), + args.batch_size, + axis=0) + generated = 0 + while generated < args.nsamples: + samples, _, _ = sampler(start_input, start_states) + for i in range(args.batch_size): + generated += 1 + ids = samples[i][0].asnumpy().tolist() + ids = ids[1:ids.index(-1)] if -1 in ids else \ + ids[1:] + text = tokenizer.decode(ids) + print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) + print(text) + print("=" * 80) + +if __name__ == '__main__': + os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' + args = parse_args() + if args.seed is not None: + np.random.seed(args.seed) + mx.random.seed(args.seed) + random.seed(args.seed) + sample_gpt2(args) + From 
d7d5ac096ff68444a34ed471d5160515840810db Mon Sep 17 00:00:00 2001
From: Hu
Date: Thu, 10 Sep 2020 23:46:15 +0800
Subject: [PATCH 11/22] update

---
 scripts/generate/README.md                    | 39 +++++++++++++++++++
 .../calculate_metrics.py                      | 23 +++++++----
 .../generate_unconditional_gpt2_samples.py    |  5 +--
 .../interactive_conditional_gpt2_samples.py   | 10 ++---
 scripts/genertate/README.md                   | 33 ----------------
 5 files changed, 60 insertions(+), 50 deletions(-)
 create mode 100644 scripts/generate/README.md
 rename scripts/{genertate => generate}/calculate_metrics.py (80%)
 rename scripts/{genertate => generate}/generate_unconditional_gpt2_samples.py (97%)
 rename scripts/{genertate => generate}/interactive_conditional_gpt2_samples.py (96%)
 delete mode 100644 scripts/genertate/README.md

diff --git a/scripts/generate/README.md b/scripts/generate/README.md
new file mode 100644
index 0000000000..3cdd340b0b
--- /dev/null
+++ b/scripts/generate/README.md
@@ -0,0 +1,39 @@
+
+Some of the examples below may include Unicode text characters. Set the environment variable:
+```bash
+export PYTHONIOENCODING=UTF-8
+```
+
+Use the following command to generate gpt2 unconditional samples
+```bash
+python3 generate_unconditional_gpt2_samples.py \
+    --model_name gpt2_774M \
+    --nsamples 5000 > samples
+```
+
+Interactively generate gpt2 conditional samples
+```bash
+python3 interactive_conditional_gpt2_samples.py \
+    --model_name gpt2_774M \
+    --nsamples 1
+```
+
+Calculate some metrics from https://arxiv.org/pdf/1904.09751.pdf.
+These metrics are just heuristics and there is no guarantee that they correlate well with human evaluation.
+```bash
+python3 calculate_metrics.py \
+    --file samples
+```
+
+
+Some metrics for the unconditional generated text
+
+| GPT2 774M     | Self-BLEU4     |Zipf Coefficient| Repetition %   |
+|---------------|----------------|----------------|----------------|
+| pure sampling |                |                | -              |
+| original gpt2 |                | -              | -              |
+| t=0.9         |                | -              | -              |
+| topk=40       |                | -              | -              |
+| topk=640      |                | -              | -              |
+| topk=40 t=0.7 |                | -              | -              |
+
diff --git a/scripts/genertate/calculate_metrics.py b/scripts/generate/calculate_metrics.py
similarity index 80%
rename from scripts/genertate/calculate_metrics.py
rename to scripts/generate/calculate_metrics.py
index 369120b9fe..5003ced0b3 100644
--- a/scripts/genertate/calculate_metrics.py
+++ b/scripts/generate/calculate_metrics.py
@@ -15,27 +15,33 @@ def parse_args():
     parser = argparse.ArgumentParser(
         description='Calculate metrics for the generated sentences')
-    parser.add_argument('--generated_file', type=str, required=True, help='Model name')
+    parser.add_argument('--file', type=str, required=True, help='Model name')
     parser.add_argument('--num_samples', type=int, default=1000, help='')
     parser.add_argument('--num_bleu_samples', type=int, default=1000, help='')
     return parser.parse_args()
 
 
-def calculate_self_bleu4(samples, num_bleu_samples):
+def calculate_self_bleu4(sample_ids, num_bleu_samples):
+    """Self-BLEU is calculated by computing the BLEU score of each generated document
+    using all other generations in the evaluation set as references. 
+ """ + sys_indices = random.sample(range(len(sample_ids)), num_bleu_samples) res = 0 for sys_indice in sys_indices: # remove it self - ref = samples[:sys_indice] + samples[sys_indice+1:] + ref = sample_ids[:sys_indice] + sample_ids[sys_indice+1:] sacrebleu_out = sacrebleu.corpus_bleu( - sys_stream=samples[sys_indice], + sys_stream=sample_ids[sys_indice], ref_streams=ref) res += sacrebleu_out.score - res /= len(samples) + res /= len(sample_ids) return res def calculate_zipf_coefficient(sample_ids, tokenizer): + """The Zipfian coefficient s can be used to compare the distribution in a given + text to a theoretically perfect exponential curve. + """ cnt = Counter() for sample_id in sample_ids: cnt.update(sample_id) @@ -47,6 +53,8 @@ def calculate_zipf_coefficient(sample_ids, tokenizer): def calculate_repetition(sample_ids): + """ + """ max_n = 90 res = 0 for sample_id in sample_ids: @@ -82,7 +90,7 @@ def calculate_metrics(args): vocab_file=local_paths['vocab']) sample_ids = tokenizer.encode(samples, output_type=int) - self_bleu4 = calculate_self_bleu4(samples, args.num_bleu_samples) + self_bleu4 = calculate_self_bleu4(sample_ids, args.num_bleu_samples) zipf_coefficient = calculate_zipf_coefficient(sample_ids, tokenizer) repetition = calculate_repetition(sample_ids) print('Self BLEU 4: {}\n' @@ -94,4 +102,3 @@ def calculate_metrics(args): if __name__ == '__main__': args = parse_args() calculate_metrics(args) - \ No newline at end of file diff --git a/scripts/genertate/generate_unconditional_gpt2_samples.py b/scripts/generate/generate_unconditional_gpt2_samples.py similarity index 97% rename from scripts/genertate/generate_unconditional_gpt2_samples.py rename to scripts/generate/generate_unconditional_gpt2_samples.py index e16214ab42..66f65763a2 100644 --- a/scripts/genertate/generate_unconditional_gpt2_samples.py +++ b/scripts/generate/generate_unconditional_gpt2_samples.py @@ -3,6 +3,7 @@ import os import mxnet as mx import argparse +from gluonnlp.utils import set_seed from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2 @@ -102,7 +103,5 @@ def sample_gpt2(args): os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' args = parse_args() if args.seed is not None: - np.random.seed(args.seed) - mx.random.seed(args.seed) - random.seed(args.seed) + set_seed(args.seed) sample_gpt2(args) diff --git a/scripts/genertate/interactive_conditional_gpt2_samples.py b/scripts/generate/interactive_conditional_gpt2_samples.py similarity index 96% rename from scripts/genertate/interactive_conditional_gpt2_samples.py rename to scripts/generate/interactive_conditional_gpt2_samples.py index ce1fd603a8..0c875b9249 100644 --- a/scripts/genertate/interactive_conditional_gpt2_samples.py +++ b/scripts/generate/interactive_conditional_gpt2_samples.py @@ -3,6 +3,7 @@ import os import mxnet as mx import argparse +from gluonnlp.utils import set_seed from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2 @@ -88,8 +89,8 @@ def sample_gpt2(args): while not raw_text: print('Prompt should not be empty!') raw_text = input("Model prompt >>> ") - context_tokens = tokenizer.encode(raw_text) - start_input = mx.np.repeat(mx.np.expand_dims(mx.np.array(context_tokens, ctx), 0), + context_tokens = tokenizer.encode(raw_text, output_type=int) + start_input = mx.np.repeat(mx.np.expand_dims(mx.np.array(context_tokens, ctx=ctx), 0), 
args.batch_size,
                                    axis=0)
         generated = 0
         while generated < args.nsamples:
@@ -109,8 +110,5 @@
 if __name__ == '__main__':
     os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
     args = parse_args()
     if args.seed is not None:
-        np.random.seed(args.seed)
-        mx.random.seed(args.seed)
-        random.seed(args.seed)
+        set_seed(args.seed)
     sample_gpt2(args)
-
diff --git a/scripts/genertate/README.md b/scripts/genertate/README.md
deleted file mode 100644
index 834d16e688..0000000000
--- a/scripts/genertate/README.md
+++ /dev/null
@@ -1,33 +0,0 @@
-
-Some of the examples below may include Unicode text characters. 
-Set the environment variable:
-```bash
-export PYTHONIOENCODING=UTF-8
-```
-
-Use the following command to generate gpt2 samples
-```bash
-python3 generate_unconditional_gpt2_samples.py \
-    --model_name gpt2_774M \
-    --nsamples 5000 > samples
-```
-
-```bash
-python3 interactive_conditional_gpt2_samples.py \
-    --model_name gpt2_774M \
-    --nsamples 1
-```
-
-```bash
-python3 calculate_metrics.py samples
-```
-
-# Paste some samples here
-
-Some metrics for the unconditional generated text
-
-| GPT2 774M | Perplexity | Self-BLEU4 |Zipf Coefficient| Repetition % |
-|---------------|--------------|----------------|----------------|----------------|
-| openai | | | - | - |
-| gluon | | - | - | - |
-| | | - | - | - |
-

From e2adfd2b1265704a0929b1650c556a04fea5211a Mon Sep 17 00:00:00 2001
From: Hu
Date: Thu, 10 Sep 2020 23:50:47 +0800
Subject: [PATCH 12/22] update

---
 scripts/generate/calculate_metrics.py            | 19 ++++---------------
 src/gluonnlp/models/model_zoo_checksums/gpt2.txt |  4 ----
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py
index 5003ced0b3..605c514d40 100644
--- a/scripts/generate/calculate_metrics.py
+++ b/scripts/generate/calculate_metrics.py
@@ -5,11 +5,8 @@ import sacrebleu
 from collections import Counter
 import operator
 import numpy as np
 from scipy import stats
-import os
 import random
-from gluonnlp.base import get_model_zoo_home_dir, get_repo_model_zoo_url, get_model_zoo_checksum_dir
-from gluonnlp.utils.misc import load_checksum_stats, download
-from gluonnlp.data.tokenizers import HuggingFaceByteBPETokenizer
+from gluonnlp.models.gpt2 import get_pretrained_gpt2
 
 
 def parse_args():
@@ def calculate_metrics(args):
     samples = samples[:args.num_samples]
     assert len(samples) == args.num_samples
 
-    local_paths = {}
-    download_jobs = [('vocab', 'gpt2_124M/gpt2-9dc62091.vocab'),
-                     ('merges', 'gpt2_124M/gpt2-396d4d8e.merges')]
-    FILE_STATS = load_checksum_stats(os.path.join(get_model_zoo_checksum_dir(), 'gpt2.txt'))
-    for k, path in download_jobs:
-        local_paths[k] = download(url=get_repo_model_zoo_url() + path,
-                                  path=os.path.join(get_model_zoo_home_dir(), path),
-                                  sha1_hash=FILE_STATS[path])
-    tokenizer = HuggingFaceByteBPETokenizer(
-        merges_file=local_paths['merges'],
-        vocab_file=local_paths['vocab'])
+    _, _, tokenizer, _, _ = get_pretrained_gpt2(
+        load_backbone=False,
+        load_lm=False)
     sample_ids = tokenizer.encode(samples, output_type=int)
diff --git a/src/gluonnlp/models/model_zoo_checksums/gpt2.txt b/src/gluonnlp/models/model_zoo_checksums/gpt2.txt
index a315a622e6..0f4af681fa 100644
--- a/src/gluonnlp/models/model_zoo_checksums/gpt2.txt
+++ b/src/gluonnlp/models/model_zoo_checksums/gpt2.txt
@@ -1,19 +1,15 @@
-gpt2_124M/model-fac1f39c.yml fac1f39c804e324c69162b9b37bd24ab98241612 424
 gpt2_124M/model_lm-99b90604.params 99b9060488b4542ccd045c28401da10a3158ca80 497771820
 gpt2_124M/gpt2-396d4d8e.merges 
396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 gpt2_124M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 gpt2_124M/model-bfed311d.params bfed311d5c980ba475f90ccf7f536d25c3b40386 497769466 -gpt2_355M/model-2aea05ff.yml 2aea05ff1e67ef816b3f824102da8b7b1292a620 425 gpt2_355M/model_lm-eed0e964.params eed0e964f4222823a557acfee2c106f228ce0188 1419317644 gpt2_355M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 gpt2_355M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 gpt2_355M/model-81dee612.params 81dee612413733899f6e5fbbeac91da781805e1b 1419312986 -gpt2_774M/model-c9555788.yml c95557880783ec4f94b09b5b045c8d9e9a198e4d 425 gpt2_774M/model_lm-cfbfa641.params cfbfa6419aaf1eae480fba5a1a7c8ea6096d43d6 3096157676 gpt2_774M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 gpt2_774M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 gpt2_774M/model-9917e24e.params 9917e24e89c651793adea69042d6cceddfc7973c 3096150714 -gpt2_1558M/model-3f8e3175.yml 3f8e3175b9faad1cc95df2f542b486698aeb5665 425 gpt2_1558M/model_lm-c8489dcb.params c8489dcbdb0d39bc3eac6d1d62e0e3dace9faa8f 6230494540 gpt2_1558M/gpt2-396d4d8e.merges 396d4d8ec90cb02f4d56e049e0e4add868bcd943 456318 gpt2_1558M/gpt2-9dc62091.vocab 9dc620913410d5ec1a988abf852891e1c9f0f649 558055 From 7854a9bcf35fcd5e0e228806197c31db9b99d8fb Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 14:11:47 +0800 Subject: [PATCH 13/22] update --- scripts/generate/README.md | 5 ++- scripts/generate/calculate_metrics.py | 33 ++++++++++--------- .../generate_unconditional_gpt2_samples.py | 2 -- .../interactive_conditional_gpt2_samples.py | 2 -- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/scripts/generate/README.md b/scripts/generate/README.md index 3cdd340b0b..136ff1a3fa 100644 --- a/scripts/generate/README.md +++ b/scripts/generate/README.md @@ -8,7 +8,10 @@ Use the following command to generate gpt2 unconditional samples ```bash python3 generate_unconditional_gpt2_samples.py \ --model_name gpt2_774M \ - --nsamples 5000 > samples + --gpu 0 \ + --temperature 0.7 \ + --top_k 40 \ + --nsamples 1000 > samples ``` Interactive generate gpt2 conditioanl samples diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index 605c514d40..ab06281c5e 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -1,6 +1,6 @@ import re import argparse -import sacrebleu +from nltk.translate.bleu_score import sentence_bleu from collections import Counter import operator import numpy as np @@ -18,20 +18,18 @@ def parse_args(): return parser.parse_args() -def calculate_self_bleu4(sample_ids, num_bleu_samples): +def calculate_self_bleu4(samples, num_bleu_samples): """Self- BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. 
""" - sys_indices = random.sample(range(len(sample_ids)), num_bleu_samples) + sys_indices = random.sample(range(len(samples)), num_bleu_samples) res = 0 for sys_indice in sys_indices: - # remove it self - ref = sample_ids[:sys_indice] + sample_ids[sys_indice+1:] - sacrebleu_out = sacrebleu.corpus_bleu( - sys_stream=sample_ids[sys_indice], - ref_streams=ref) - res += sacrebleu_out.score - res /= len(sample_ids) + res += sentence_bleu( + hypothesis=samples[sys_indice], + references=samples[:sys_indice] + samples[sys_indice+1:], + weights=(0.25, 0.25, 0.25, 0.25)) + res /= len(samples) return res @@ -50,10 +48,10 @@ def calculate_zipf_coefficient(sample_ids, tokenizer): def calculate_repetition(sample_ids): - """ + """The repetition rate in generated samples. """ max_n = 90 - res = 0 + n_repeated_examples = 0 for sample_id in sample_ids: rev = list(reversed(sample_id)) last_n_repeats = [0 for _ in range(max_n)] @@ -62,9 +60,12 @@ def calculate_repetition(sample_ids): while len(rev[n*n_repeat:n*(n_repeat+1)]) == n and \ rev[n*n_repeat:n*(n_repeat+1)] == rev[:n]: n_repeat += 1 - last_n_repeat[n-1] = n_repeat -# res += (sum(last_n_repeat) / ) TODO - + last_n_repeats[n-1] = n_repeat + max_repeated_n = max(range(max_n), key=lambda x: last_n_repeats[x]) + if last_n_repeats[max_repeated_n] > 1 and (max_repeated_n+1 >= 3 or last_n_repeats[max_repeated_n] > 50): + n_repeated_examples += 1 + return n_repeated_examples / len(sample_ids) + def calculate_metrics(args): with open(args.generated_file, encoding='utf-8') as of: @@ -79,7 +80,7 @@ def calculate_metrics(args): load_lm=False) sample_ids = tokenizer.encode(samples, output_type=int) - self_bleu4 = calculate_self_bleu4(sample_ids, args.num_bleu_samples) + self_bleu4 = calculate_self_bleu4(samples, args.num_bleu_samples) zipf_coefficient = calculate_zipf_coefficient(sample_ids, tokenizer) repetition = calculate_repetition(sample_ids) print('Self BLEU 4: {}\n' diff --git a/scripts/generate/generate_unconditional_gpt2_samples.py b/scripts/generate/generate_unconditional_gpt2_samples.py index 66f65763a2..aa7cf3ecab 100644 --- a/scripts/generate/generate_unconditional_gpt2_samples.py +++ b/scripts/generate/generate_unconditional_gpt2_samples.py @@ -1,5 +1,3 @@ -import numpy as np -import random import os import mxnet as mx import argparse diff --git a/scripts/generate/interactive_conditional_gpt2_samples.py b/scripts/generate/interactive_conditional_gpt2_samples.py index 0c875b9249..03102a6f86 100644 --- a/scripts/generate/interactive_conditional_gpt2_samples.py +++ b/scripts/generate/interactive_conditional_gpt2_samples.py @@ -1,5 +1,3 @@ -import numpy as np -import random import os import mxnet as mx import argparse From 260d74b85a3dd39cd1b94e32f10ab9e1b3f26f07 Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 14:52:07 +0800 Subject: [PATCH 14/22] update --- scripts/generate/calculate_metrics.py | 37 ++++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index ab06281c5e..2ad7c1afc2 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -1,3 +1,4 @@ +import os import re import argparse from nltk.translate.bleu_score import sentence_bleu @@ -6,6 +7,9 @@ import numpy as np from scipy import stats import random +import tqdm +from functools import partial +from multiprocessing.pool import Pool from gluonnlp.models.gpt2 import get_pretrained_gpt2 @@ -19,18 +23,27 @@ def parse_args(): def 
calculate_self_bleu4(samples, num_bleu_samples): - """Self- BLEU is calculated by computing the BLEU score of each generated document + """Self-BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. """ - sys_indices = random.sample(range(len(samples)), num_bleu_samples) - res = 0 - for sys_indice in sys_indices: - res += sentence_bleu( - hypothesis=samples[sys_indice], - references=samples[:sys_indice] + samples[sys_indice+1:], - weights=(0.25, 0.25, 0.25, 0.25)) - res /= len(samples) - return res + def bleu(samples, i): + return sentence_bleu( + hypothesis=samples[i], + references=samples[:i] + samples[i+1:], + weights=(0.25, 0.25, 0.25, 0.25) + ) + + bleu_scores = [] + pool = Pool(processes=os.cpu_count()) + bleu_scores.append( + list(tqdm( + pool.imap_unordered( + partial(bleu, samples), + random.sample(range(len(samples)), num_bleu_samples)), + total=num_bleu_samples + )) + ) + return sum(bleu_scores) / num_bleu_samples def calculate_zipf_coefficient(sample_ids, tokenizer): @@ -68,14 +81,14 @@ def calculate_repetition(sample_ids): def calculate_metrics(args): - with open(args.generated_file, encoding='utf-8') as of: + with open(args.file, encoding='utf-8') as of: samples = of.read() pattern = '='*40 + ' SAMPLE \d+ ' + '='*40 + '\n' samples = re.split(pattern, samples)[1:] samples = samples[:args.num_samples] assert len(samples) == args.num_samples - _, _, tokenizer, _, _ = get_pretrained_gpt2( + _, tokenizer, _, _ = get_pretrained_gpt2( load_backbone=False, load_lm=False) sample_ids = tokenizer.encode(samples, output_type=int) From be64347fc350a2e17e93dcd8650ecd6f8e9f34ad Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 14:57:28 +0800 Subject: [PATCH 15/22] update --- scripts/generate/calculate_metrics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index 2ad7c1afc2..14d8bd77ff 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -22,17 +22,18 @@ def parse_args(): return parser.parse_args() +def bleu(samples, i): + return sentence_bleu( + hypothesis=samples[i], + references=samples[:i] + samples[i+1:], + weights=(0.25, 0.25, 0.25, 0.25) + ) + + def calculate_self_bleu4(samples, num_bleu_samples): """Self-BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. 
""" - def bleu(samples, i): - return sentence_bleu( - hypothesis=samples[i], - references=samples[:i] + samples[i+1:], - weights=(0.25, 0.25, 0.25, 0.25) - ) - bleu_scores = [] pool = Pool(processes=os.cpu_count()) bleu_scores.append( From 57797c2b86a4fbc4db319ea983624756aa4ba6d6 Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 15:43:48 +0800 Subject: [PATCH 16/22] update --- scripts/generate/calculate_metrics.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index 14d8bd77ff..dd7cc011ce 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -7,7 +7,7 @@ import numpy as np from scipy import stats import random -import tqdm +from tqdm import tqdm from functools import partial from multiprocessing.pool import Pool from gluonnlp.models.gpt2 import get_pretrained_gpt2 @@ -22,18 +22,17 @@ def parse_args(): return parser.parse_args() -def bleu(samples, i): - return sentence_bleu( - hypothesis=samples[i], - references=samples[:i] + samples[i+1:], - weights=(0.25, 0.25, 0.25, 0.25) - ) - - def calculate_self_bleu4(samples, num_bleu_samples): """Self-BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. """ + def bleu(samples, i): + return sentence_bleu( + hypothesis=samples[i], + references=samples[:i] + samples[i+1:], + weights=(0.25, 0.25, 0.25, 0.25) + ) + bleu_scores = [] pool = Pool(processes=os.cpu_count()) bleu_scores.append( From a0ead28fded67595095e5649c7a18ad4b4187ea3 Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 15:44:57 +0800 Subject: [PATCH 17/22] update --- scripts/generate/calculate_metrics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index dd7cc011ce..e7026bee20 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -22,17 +22,18 @@ def parse_args(): return parser.parse_args() +def bleu(samples, i): + return sentence_bleu( + hypothesis=samples[i], + references=samples[:i] + samples[i+1:], + weights=(0.25, 0.25, 0.25, 0.25) + ) + + def calculate_self_bleu4(samples, num_bleu_samples): """Self-BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. 
""" - def bleu(samples, i): - return sentence_bleu( - hypothesis=samples[i], - references=samples[:i] + samples[i+1:], - weights=(0.25, 0.25, 0.25, 0.25) - ) - bleu_scores = [] pool = Pool(processes=os.cpu_count()) bleu_scores.append( From 42da589db19e3452ed6977c1e874787d9eb168a5 Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 16:26:47 +0800 Subject: [PATCH 18/22] update --- scripts/generate/calculate_metrics.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index e7026bee20..998b76cf48 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -36,14 +36,12 @@ def calculate_self_bleu4(samples, num_bleu_samples): """ bleu_scores = [] pool = Pool(processes=os.cpu_count()) - bleu_scores.append( - list(tqdm( - pool.imap_unordered( - partial(bleu, samples), - random.sample(range(len(samples)), num_bleu_samples)), - total=num_bleu_samples - )) - ) + bleu_scores = list(tqdm( + pool.imap_unordered( + partial(bleu, samples), + random.sample(range(len(samples)), num_bleu_samples)), + total=num_bleu_samples + )) return sum(bleu_scores) / num_bleu_samples From 887b3b62ac92d7751194cdefa10cabdc5b01ae92 Mon Sep 17 00:00:00 2001 From: Hu Date: Fri, 11 Sep 2020 19:07:01 +0800 Subject: [PATCH 19/22] update --- scripts/generate/README.md | 4 ++++ scripts/generate/calculate_metrics.py | 8 ++++---- scripts/generate/interactive_conditional_gpt2_samples.py | 2 +- src/gluonnlp/sequence_sampler.py | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/generate/README.md b/scripts/generate/README.md index 136ff1a3fa..70846a0e2a 100644 --- a/scripts/generate/README.md +++ b/scripts/generate/README.md @@ -14,6 +14,7 @@ python3 generate_unconditional_gpt2_samples.py \ --nsamples 1000 > samples ``` + Interactive generate gpt2 conditioanl samples ```bash python3 interactive_conditional_gpt2_samples.py \ @@ -40,3 +41,6 @@ Some metrics for the unconditional generated text | topk=640 | | - | - | | topk=40 t=0.7 | | - | - | + +Some interesting generated unconditional samples +# TODO \ No newline at end of file diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index 998b76cf48..2809024d5a 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -46,17 +46,17 @@ def calculate_self_bleu4(samples, num_bleu_samples): def calculate_zipf_coefficient(sample_ids, tokenizer): - """The Zipfian coefficient s can be used to compare the distribution in a given + """The Zipfian coefficient (R-squared) can be used to compare the distribution in a given text to a theoretically perfect exponential curve. 
""" cnt = Counter() for sample_id in sample_ids: cnt.update(sample_id) - xs = np.arange(1, min(len(cnt), len(tokenizer.vocab))) + xs = np.arange(1, min(len(cnt), len(tokenizer.vocab)) + 1) ys = np.array(sorted(cnt.values(), key=operator.neg)[:len(tokenizer.vocab)]) _, _, r, _, _ = stats.linregress(np.log(xs), np.log(ys)) - return r + return r ** 2 def calculate_repetition(sample_ids): @@ -97,7 +97,7 @@ def calculate_metrics(args): repetition = calculate_repetition(sample_ids) print('Self BLEU 4: {}\n' 'Zipf coefficient: {}\n' - 'Repectition: {}\n' + 'Repetition: {}\n' .format(self_bleu4, zipf_coefficient, repetition)) diff --git a/scripts/generate/interactive_conditional_gpt2_samples.py b/scripts/generate/interactive_conditional_gpt2_samples.py index 03102a6f86..8d3c7c85e0 100644 --- a/scripts/generate/interactive_conditional_gpt2_samples.py +++ b/scripts/generate/interactive_conditional_gpt2_samples.py @@ -42,7 +42,7 @@ def state_batch_axis(self): def init_states(self, batch_size, ctx): return self._gpt2_lm_model.init_states(batch_size, ctx) def __call__(self, data, states): - data = mx.npx.reshape(data, (-1, 1)) + data = mx.npx.reshape(data, (-2, -1)) logits, new_states = self._gpt2_lm_model(data, states) return logits[:,-1,:], new_states diff --git a/src/gluonnlp/sequence_sampler.py b/src/gluonnlp/sequence_sampler.py index 36b98468cf..6ad1f1e035 100644 --- a/src/gluonnlp/sequence_sampler.py +++ b/src/gluonnlp/sequence_sampler.py @@ -577,7 +577,7 @@ def forward(self, inputs, states, src_seq_lengths=None): scores = mx.np.zeros(shape=(batch_size, beam_size), ctx=ctx) if beam_size > 1: scores[:, 1:beam_size] = LARGE_NEGATIVE_FLOAT - samples = step_input.reshape((batch_size, beam_size, 1)) + samples = step_input.reshape((batch_size, beam_size, -1)) batch_shift = mx.np.arange(0, batch_size * beam_size, beam_size, ctx=ctx, dtype=mx.np.int32) step = mx.np.array(0, ctx=ctx, dtype=mx.np.float32) for i in range(max_length): From 0df7c7023f98383c735e12644d2d01ec07fc970e Mon Sep 17 00:00:00 2001 From: Hu Date: Sat, 12 Sep 2020 00:22:47 +0800 Subject: [PATCH 20/22] update --- scripts/generate/calculate_metrics.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py index 2809024d5a..a18a9479c1 100644 --- a/scripts/generate/calculate_metrics.py +++ b/scripts/generate/calculate_metrics.py @@ -22,15 +22,15 @@ def parse_args(): return parser.parse_args() -def bleu(samples, i): +def bleu(sample_strs, i): return sentence_bleu( - hypothesis=samples[i], - references=samples[:i] + samples[i+1:], + hypothesis=sample_strs[i], + references=sample_strs[:i] + sample_strs[i+1:], weights=(0.25, 0.25, 0.25, 0.25) ) -def calculate_self_bleu4(samples, num_bleu_samples): +def calculate_self_bleu4(sample_strs, num_bleu_samples): """Self-BLEU is calculated by computing the BLEU score of each generated document using all other generations in the evaluation set as references. 
""" @@ -38,8 +38,8 @@ def calculate_self_bleu4(samples, num_bleu_samples): pool = Pool(processes=os.cpu_count()) bleu_scores = list(tqdm( pool.imap_unordered( - partial(bleu, samples), - random.sample(range(len(samples)), num_bleu_samples)), + partial(bleu, sample_strs), + random.sample(range(len(sample_strs)), num_bleu_samples)), total=num_bleu_samples )) return sum(bleu_scores) / num_bleu_samples @@ -91,8 +91,9 @@ def calculate_metrics(args): load_backbone=False, load_lm=False) sample_ids = tokenizer.encode(samples, output_type=int) + sample_strs = tokenizer.encode(samples, output_type=str) - self_bleu4 = calculate_self_bleu4(samples, args.num_bleu_samples) + self_bleu4 = calculate_self_bleu4(sample_strs, args.num_bleu_samples) zipf_coefficient = calculate_zipf_coefficient(sample_ids, tokenizer) repetition = calculate_repetition(sample_ids) print('Self BLEU 4: {}\n' From df86748d8efc19d8a6a3a157b002b8a1ee8bb329 Mon Sep 17 00:00:00 2001 From: Hu Date: Sat, 12 Sep 2020 12:09:44 +0800 Subject: [PATCH 21/22] update --- scripts/generate/README.md | 56 +++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/scripts/generate/README.md b/scripts/generate/README.md index 70846a0e2a..b009891d3d 100644 --- a/scripts/generate/README.md +++ b/scripts/generate/README.md @@ -1,4 +1,3 @@ - Some of the examples below may include Unicode text characters. Set the environment variable: ```bash export PYTHONIOENCODING=UTF-8 @@ -15,7 +14,7 @@ python3 generate_unconditional_gpt2_samples.py \ ``` -Interactive generate gpt2 conditioanl samples +Interactively generate gpt2 conditioanl samples ```bash python3 interactive_conditional_gpt2_samples.py \ --model_name gpt2_774M \ @@ -34,13 +33,50 @@ Some metrics for the unconditional generated text | GPT2 774M | Self-BLEU4 |Zipf Coefficient| Repetition % | |---------------|----------------|----------------|----------------| -| pure sampling | | | - | -| original gpt2 | | - | - | -| t=0.9 | | - | - | -| topk=40 | | - | - | -| topk=640 | | - | - | -| topk=40 t=0.7 | | - | - | +| pure sampling | 0.2701 | 0.9522 | 0.0 | +| original gpt2 | 0.2750 | 0.9512 | 0.0 | +| t=0.9 | 0.3683 | 0.9619 | 0.1 | +| topk=40 | 0.4291 | 0.9666 | 0.0 | +| topk=640 | 0.3384 | 0.9623 | 0.0 | +| topk=40 t=0.7 | 0.4621 | 0.9586 | 1.1 | + + +Part of some interesting generated unconditional example + + +A story +``` +Looking back, Dil shook his head. The doll market was growing at an extraordinary rate; in his own opinion, it was unwarranted since his brother was sold to an abandoned bank. He was aware of what he had to do and was sure where his family was going; the thoughts worried him. + +Although his brother had already grown an incredibly bulky gig paperback, he had never read a novel with an arguably more sinister turn. The intellectual gift of a child was reserved for reciting worked examples. As usual, exploiting loopholes, smart brother had practiced the art of overacting. Those tricks that remained medicinal classes grew weaker and smaller; in the end, one could not predict the fruition of those fighting skills. + +Although he knew of a possible method of dealing with the right-winger, although he did not get his brother's hands on it, Regulus had already leaked his intentions in searching for Dil. He had already rushed passengers directly including that stupid bull. Due to the numerous setback, while Dil had luckily survived, he still suffered a decrease in his power. 
From df86748d8efc19d8a6a3a157b002b8a1ee8bb329 Mon Sep 17 00:00:00 2001
From: Hu
Date: Sat, 12 Sep 2020 12:09:44 +0800
Subject: [PATCH 21/22] add metric results and generated samples to the README

---
 scripts/generate/README.md | 56 +++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/scripts/generate/README.md b/scripts/generate/README.md
index 70846a0e2a..b009891d3d 100644
--- a/scripts/generate/README.md
+++ b/scripts/generate/README.md
@@ -1,4 +1,3 @@
-
 Some of the examples below may include Unicode text characters. Set the environment variable:
 ```bash
 export PYTHONIOENCODING=UTF-8
@@ -15,7 +14,7 @@ python3 generate_unconditional_gpt2_samples.py \
 ```
 
 
-Interactive generate gpt2 conditioanl samples
+Interactively generate gpt2 conditional samples
 ```bash
 python3 interactive_conditional_gpt2_samples.py \
     --model_name gpt2_774M \
@@ -34,13 +33,50 @@ Some metrics for the unconditional generated text
 
 | GPT2 774M     | Self-BLEU4     |Zipf Coefficient| Repetition %   |
 |---------------|----------------|----------------|----------------|
-| pure sampling |                |                |       -        |
-| original gpt2 |                |       -        |       -        |
-| t=0.9         |                |       -        |       -        |
-| topk=40       |                |       -        |       -        |
-| topk=640      |                |       -        |       -        |
-| topk=40 t=0.7 |                |       -        |       -        |
-
-
-Some interesting generated unconditional samples
-# TODO
\ No newline at end of file
+| pure sampling | 0.2701         | 0.9522         | 0.0            |
+| original gpt2 | 0.2750         | 0.9512         | 0.0            |
+| t=0.9         | 0.3683         | 0.9619         | 0.1            |
+| topk=40       | 0.4291         | 0.9666         | 0.0            |
+| topk=640      | 0.3384         | 0.9623         | 0.0            |
+| topk=40 t=0.7 | 0.4621         | 0.9586         | 1.1            |
+
+
+Excerpts from some interesting generated unconditional samples
+
+
+A story
+```
+Looking back, Dil shook his head. The doll market was growing at an extraordinary rate; in his own opinion, it was unwarranted since his brother was sold to an abandoned bank. He was aware of what he had to do and was sure where his family was going; the thoughts worried him.
+
+Although his brother had already grown an incredibly bulky gig paperback, he had never read a novel with an arguably more sinister turn. The intellectual gift of a child was reserved for reciting worked examples. As usual, exploiting loopholes, smart brother had practiced the art of overacting. Those tricks that remained medicinal classes grew weaker and smaller; in the end, one could not predict the fruition of those fighting skills.
+
+Although he knew of a possible method of dealing with the right-winger, although he did not get his brother's hands on it, Regulus had already leaked his intentions in searching for Dil. He had already rushed passengers directly including that stupid bull. Due to the numerous setback, while Dil had luckily survived, he still suffered a decrease in his power.
+
+He was reminded of the real reason why keeping secrets was not worth nothing; one must develop ones latent talents; in order to reverse one's stage of development all one had to do was give lessons to an opposite-type STUDENT that had similar abilities to those those that were bestowed by the parents; it was thus necessary to sift through the cat and mouse game over the years for those that had true deficiencies.
+```
+
+Code with comments
+```
+struct Read {
+
+index: usize ,
+
+size: usize ,
+
+}
+
+extern crate glob;
+
+/// A function indexed by some unique index (The &uniqueID Thing will become the
+/// @value@).
+
+struct Parse {
+
+index: usize ,
+
+index64: usize ,
+
+}
+
+```

From d84f53f38fc520e3c2a60c94a48ef4e1d8127079 Mon Sep 17 00:00:00 2001
From: Hu
Date: Sat, 12 Sep 2020 22:20:36 +0800
Subject: [PATCH 22/22] strip the EOS id from each sample before computing metrics

---
 scripts/generate/calculate_metrics.py | 2 ++
 tests/test_models_gpt2.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/generate/calculate_metrics.py b/scripts/generate/calculate_metrics.py
index a18a9479c1..72b7b74c36 100644
--- a/scripts/generate/calculate_metrics.py
+++ b/scripts/generate/calculate_metrics.py
@@ -91,6 +91,8 @@ def calculate_metrics(args):
         load_backbone=False,
         load_lm=False)
     sample_ids = tokenizer.encode(samples, output_type=int)
+    sample_ids = [ids[:-1] if ids[-1] == tokenizer.vocab.eos_id else ids
+                  for ids in sample_ids]
     sample_strs = tokenizer.encode(samples, output_type=str)
 
     self_bleu4 = calculate_self_bleu4(sample_strs, args.num_bleu_samples)
diff --git a/tests/test_models_gpt2.py b/tests/test_models_gpt2.py
index 8de52efb16..1b510ba332 100644
--- a/tests/test_models_gpt2.py
+++ b/tests/test_models_gpt2.py
@@ -107,7 +107,7 @@ def test_gpt2_incremental_states(ctx):
 
 @pytest.mark.slow
 @pytest.mark.remote_required
-@pytest.mark.parametrize('model_name', list_pretrained_gpt2())
+@pytest.mark.parametrize('model_name', ['gpt2_124M', 'gpt2_355M', 'gpt2_774M'])
 def test_gpt2(model_name, ctx):
     # test from pretrained
     assert len(list_pretrained_gpt2()) > 0
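For reference, a toy sketch of the per-sample EOS stripping added in the final patch. `tokenizer.encode` over a list of documents yields one id list per document, so the check has to run per sample; the id `50256` (GPT-2's `<|endoftext|>`) is assumed here for illustration in place of `tokenizer.vocab.eos_id`.

```python
eos_id = 50256  # assumed GPT-2 <|endoftext|> id, for illustration only
sample_ids = [[464, 3290, 50256], [464, 2415, 318]]

# drop a trailing EOS from each sample, leaving other samples untouched
sample_ids = [ids[:-1] if ids and ids[-1] == eos_id else ids
              for ids in sample_ids]
print(sample_ids)  # [[464, 3290], [464, 2415, 318]]
```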