This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[FEATURE] gpt2 generation scripts #1354

Merged 24 commits on Sep 12, 2020
4 changes: 2 additions & 2 deletions scripts/conversion_toolkits/convert_gpt2.py
@@ -170,8 +170,8 @@ def test_model(tf_model_path, gluon_model):

# gluon model
gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
-    gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states, mx.np.array(0, dtype=np.int32, ctx=ctx))
-    gl_logits_2, _ = gluon_model(gl_input_ids, gl_states, mx.np.array(seq_length, dtype=np.int32, ctx=ctx))
+    gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states)
+    gl_logits_2, _ = gluon_model(gl_input_ids, gl_states)

# tf model
with tf.Session(graph=tf.Graph()) as sess:
2 changes: 1 addition & 1 deletion scripts/conversion_toolkits/convert_gpt2.sh
@@ -1,6 +1,6 @@
python3 -m pip install tensorflow==1.15 --upgrade --user
git clone https://github.com/openai/gpt-2.git gpt_2
-for model in 124M 355M 774M
+for model in 124M 355M 774M 1558M
do
python3 gpt_2/download_model.py ${model}
mkdir gpt2_${model}
82 changes: 82 additions & 0 deletions scripts/generate/README.md
@@ -0,0 +1,82 @@
Some of the examples below may include Unicode characters, so set the environment variable:
```bash
export PYTHONIOENCODING=UTF-8
```

Use the following command to generate unconditional GPT-2 samples:
```bash
python3 generate_unconditional_gpt2_samples.py \
--model_name gpt2_774M \
--gpu 0 \
--temperature 0.7 \
--top_k 40 \
--nsamples 1000 > samples
```
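
Each sample in the redirected output is preceded by a delimiter line of the form below; this is the pattern that `calculate_metrics.py` later splits on:
```
======================================== SAMPLE 1 ========================================
```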


Interactively generate conditional GPT-2 samples:
```bash
python3 interactive_conditional_gpt2_samples.py \
--model_name gpt2_774M \
--nsamples 1
```

Calculate some of the metrics from https://arxiv.org/pdf/1904.09751.pdf.
These metrics are heuristics, and there is no guarantee that they correlate well with human evaluation.
```bash
python3 calculate_metrics.py \
--file samples
```
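
The script prints the three metrics in the form below (illustrative values, not measured results):
```
Self BLEU 4: 0.4291
Zipf coefficient: 0.9666
Repetition: 0.0
```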


Some metrics for the unconditionally generated text

| GPT-2 774M    | Self-BLEU4     | Zipf Coefficient | Repetition %   |
|---------------|----------------|----------------|----------------|
| pure sampling | 0.2701 | 0.9522 | 0.0 |
| original gpt2 | 0.2750 | 0.9512 | 0.0 |
| t=0.9 | 0.3683 | 0.9619 | 0.1 |
| topk=40 | 0.4291 | 0.9666 | 0.0 |
| topk=640 | 0.3384 | 0.9623 | 0.0 |
| topk=40 t=0.7 | 0.4621 | 0.9586 | 1.1 |


Excerpts from some interesting unconditionally generated samples


A story
```
Looking back, Dil shook his head. The doll market was growing at an extraordinary rate; in his own opinion, it was unwarranted since his brother was sold to an abandoned bank. He was aware of what he had to do and was sure where his family was going; the thoughts worried him.

Although his brother had already grown an incredibly bulky gig paperback, he had never read a novel with an arguably more sinister turn. The intellectual gift of a child was reserved for reciting worked examples. As usual, exploiting loopholes, smart brother had practiced the art of overacting. Those tricks that remained medicinal classes grew weaker and smaller; in the end, one could not predict the fruition of those fighting skills.

Although he knew of a possible method of dealing with the right-winger, although he did not get his brother's hands on it, Regulus had already leaked his intentions in searching for Dil. He had already rushed passengers directly including that stupid bull. Due to the numerous setback, while Dil had luckily survived, he still suffered a decrease in his power.

He was reminded of the real reason why keeping secrets was not worth nothing; one must develop ones latent talents; in order to reverse one's stage of development all one had to do was give lessons to an opposite-type STUDENT that had similar abilities to those those that were bestowed by the parents; it was thus necessary to sift through the cat and mouse game over the years for those that had true deficiencies.
```

Code with comments
```
struct Read <T> {

index: usize ,

size: usize ,

}

extern crate glob;

/// A function indexed by some unique index (The &uniqueID Thing will become the

/// @value@).

struct Parse <T: Read, U: Read, D: Read, Tore: Read> {

index: usize ,

index64: usize ,

}

```
109 changes: 109 additions & 0 deletions scripts/generate/calculate_metrics.py
@@ -0,0 +1,109 @@
import os
import re
import argparse
from nltk.translate.bleu_score import sentence_bleu
from collections import Counter
import operator
import numpy as np
from scipy import stats
import random
from tqdm import tqdm
from functools import partial
from multiprocessing.pool import Pool
from gluonnlp.models.gpt2 import get_pretrained_gpt2


def parse_args():
parser = argparse.ArgumentParser(
description='Calculate metrics for the generated sentences')
    parser.add_argument('--file', type=str, required=True,
                        help='Path to the file containing the generated samples')
    parser.add_argument('--num_samples', type=int, default=1000,
                        help='Number of generated samples to evaluate')
    parser.add_argument('--num_bleu_samples', type=int, default=1000,
                        help='Number of samples drawn to estimate Self-BLEU')
return parser.parse_args()


def bleu(sample_strs, i):
return sentence_bleu(
hypothesis=sample_strs[i],
references=sample_strs[:i] + sample_strs[i+1:],
weights=(0.25, 0.25, 0.25, 0.25)
)
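

For intuition, here is a minimal self-contained check of this building block: NLTK's `sentence_bleu` scores one tokenized sample against the remaining samples as references (the token lists below are made up for illustration):

```python
from nltk.translate.bleu_score import sentence_bleu

# Hypothetical token lists standing in for tokenized generated samples.
samples = [
    ['the', 'cat', 'sat', 'on', 'the', 'mat'],
    ['the', 'cat', 'sat', 'on', 'a', 'mat'],
    ['a', 'dog', 'slept', 'on', 'the', 'mat'],
]
# Score sample 0 against samples 1 and 2, exactly as bleu() above does.
score = sentence_bleu(
    hypothesis=samples[0],
    references=samples[1:],
    weights=(0.25, 0.25, 0.25, 0.25))
print(score)  # higher means sample 0 overlaps more with the other samples
```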


def calculate_self_bleu4(sample_strs, num_bleu_samples):
"""Self-BLEU is calculated by computing the BLEU score of each generated document
using all other generations in the evaluation set as references.
"""
    with Pool(processes=os.cpu_count()) as pool:
        bleu_scores = list(tqdm(
            pool.imap_unordered(
                partial(bleu, sample_strs),
                random.sample(range(len(sample_strs)), num_bleu_samples)),
            total=num_bleu_samples
        ))
return sum(bleu_scores) / num_bleu_samples


def calculate_zipf_coefficient(sample_ids, tokenizer):
"""The Zipfian coefficient (R-squared) can be used to compare the distribution in a given
text to a theoretically perfect exponential curve.
"""
cnt = Counter()
for sample_id in sample_ids:
cnt.update(sample_id)

xs = np.arange(1, min(len(cnt), len(tokenizer.vocab)) + 1)
ys = np.array(sorted(cnt.values(), key=operator.neg)[:len(tokenizer.vocab)])
_, _, r, _, _ = stats.linregress(np.log(xs), np.log(ys))
return r ** 2
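

As a sanity check of the fit above, synthetic frequencies that follow an exact power law yield an R-squared of 1.0 (a minimal sketch, not part of the script):

```python
import numpy as np
from scipy import stats

# Synthetic rank/frequency pairs following an exact Zipfian power law
# freq = 1000 / rank, so the regression in log-log space is perfectly linear.
ranks = np.arange(1, 101)
freqs = 1000.0 / ranks
_, _, r, _, _ = stats.linregress(np.log(ranks), np.log(freqs))
print(r ** 2)  # 1.0 for ideal Zipfian data; real text falls somewhat below
```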


def calculate_repetition(sample_ids):
"""The repetition rate in generated samples.
"""
max_n = 90
n_repeated_examples = 0
for sample_id in sample_ids:
rev = list(reversed(sample_id))
last_n_repeats = [0 for _ in range(max_n)]
for n in range(1, max_n + 1):
n_repeat = 1
while len(rev[n*n_repeat:n*(n_repeat+1)]) == n and \
rev[n*n_repeat:n*(n_repeat+1)] == rev[:n]:
n_repeat += 1
last_n_repeats[n-1] = n_repeat
max_repeated_n = max(range(max_n), key=lambda x: last_n_repeats[x])
if last_n_repeats[max_repeated_n] > 1 and (max_repeated_n+1 >= 3 or last_n_repeats[max_repeated_n] > 50):
n_repeated_examples += 1
return n_repeated_examples / len(sample_ids)
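

A minimal sketch of what this heuristic flags, using made-up token id sequences and the `calculate_repetition` function above: a sample whose tail degenerates into a single repeated token is counted, while a non-repeating sample is not:

```python
# Made-up token id sequences for illustration.
looping = list(range(10)) + [7] * 60  # tail stuck repeating token id 7
clean = list(range(70))               # strictly increasing ids, no repeated tail
print(calculate_repetition([looping, clean]))  # 0.5: one of two samples flagged
```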


def calculate_metrics(args):
with open(args.file, encoding='utf-8') as of:
samples = of.read()
    pattern = '='*40 + r' SAMPLE \d+ ' + '='*40 + '\n'
samples = re.split(pattern, samples)[1:]
samples = samples[:args.num_samples]
assert len(samples) == args.num_samples

_, tokenizer, _, _ = get_pretrained_gpt2(
load_backbone=False,
load_lm=False)
sample_ids = tokenizer.encode(samples, output_type=int)
    # Strip a trailing EOS token from each sample, if present
    for ids in sample_ids:
        if ids and ids[-1] == tokenizer.vocab.eos_id:
            ids.pop()
sample_strs = tokenizer.encode(samples, output_type=str)

self_bleu4 = calculate_self_bleu4(sample_strs, args.num_bleu_samples)
zipf_coefficient = calculate_zipf_coefficient(sample_ids, tokenizer)
repetition = calculate_repetition(sample_ids)
print('Self BLEU 4: {}\n'
'Zipf coefficient: {}\n'
'Repetition: {}\n'
.format(self_bleu4, zipf_coefficient, repetition))


if __name__ == '__main__':
args = parse_args()
calculate_metrics(args)
105 changes: 105 additions & 0 deletions scripts/generate/generate_unconditional_gpt2_samples.py
@@ -0,0 +1,105 @@
import os
import mxnet as mx
import argparse
from gluonnlp.utils import set_seed
from gluonnlp.sequence_sampler import BeamSearchSampler, BaseStepDecoder
from gluonnlp.models.gpt2 import GPT2ForLM, list_pretrained_gpt2, get_pretrained_gpt2

mx.npx.set_np()

def parse_args():
parser = argparse.ArgumentParser(
description='GPT-2 unconditional sampler. Load a GPT-2 model and sample.')
parser.add_argument('--model_name', type=str, default='gpt2_124M',
choices=list_pretrained_gpt2(), help='Model name')
parser.add_argument('--seed', type=int, default=None, help='The random seed')
    parser.add_argument('--nsamples', type=int, default=0,
                        help='Number of samples to generate; a value <= 0 means '
                             'generate indefinitely')
    parser.add_argument('--batch_size', type=int, default=1,
                        help='Number of samples to generate per batch')
parser.add_argument('--length', type=int, default=None,
help='Number of tokens in generated text, if None (default), is '
'determined by model max_length')
    parser.add_argument('--temperature', type=float, default=1.0,
                        help='Softmax temperature used when sampling')
parser.add_argument('--top_k', type=int, default=-1,
help='Multinomial sampling with topk, '
'see [ACL2018] "Hierarchical Neural Story Generation"'
'https://www.aclweb.org/anthology/P18-1082.pdf')
parser.add_argument('--top_p', type=float, default=-1.0,
help='Multinomial sampling with topp, '
'see [ICLR2020] "The Curious Case of Neural Text Degeneration"'
'https://arxiv.org/abs/1904.09751')
parser.add_argument('--gpu', type=int, default=0,
help='Which gpu to use, set None to use cpu')
return parser.parse_args()


class GPT2Decoder(BaseStepDecoder):
def __init__(self, gpt2_lm_model):
self._gpt2_lm_model = gpt2_lm_model
@property
Review comment (Member): Add newline

Review comment (Member): The correct coding style should be

    def __init__(...):
       ...

    @property
    ...
def state_batch_axis(self):
return 2 if self._gpt2_lm_model._backbone_model.layout == 'NT' else 3
def init_states(self, batch_size, ctx):
return self._gpt2_lm_model.init_states(batch_size, ctx)
def __call__(self, data, states):
data = mx.npx.reshape(data, (-1, 1))
logits, new_states = self._gpt2_lm_model(data, states)
return logits[:,-1,:], new_states
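

For reference, a step decoder of this shape can be driven one token at a time roughly as follows (a minimal greedy sketch for illustration only; the `BeamSearchSampler` below performs this loop internally, with sampling instead of argmax):

```python
import mxnet as mx
mx.npx.set_np()

def greedy_generate(step_decoder, eos_id, max_length, ctx):
    """Illustrative sketch: drive a BaseStepDecoder step by step."""
    states = step_decoder.init_states(1, ctx)
    token = mx.np.full((1, 1), eos_id, ctx=ctx)
    out = []
    for _ in range(max_length):
        # Each call consumes the last token plus the cached states and
        # returns next-token logits of shape (batch, vocab_size).
        logits, states = step_decoder(token, states)
        next_id = int(mx.np.argmax(logits, axis=-1).asnumpy()[0])
        if next_id == eos_id:
            break
        out.append(next_id)
        token = mx.np.full((1, 1), next_id, ctx=ctx)
    return out
```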


def sample_gpt2(args):
    ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()

cfg, tokenizer, _, lm_params_path = get_pretrained_gpt2(
model_name=args.model_name,
load_backbone=False,
load_lm=True)

if args.length is None:
args.length = cfg.MODEL.max_length
assert args.length <= cfg.MODEL.max_length, \
"Can't get samples longer than window size: {}".format(cfg.MODEL.max_length)

model = GPT2ForLM(cfg)
model.hybridize()
model.load_parameters(lm_params_path, ctx=ctx)
gpt2decoder = GPT2Decoder(model)

sampler = BeamSearchSampler(
beam_size=1,
decoder=gpt2decoder,
eos_id=None,
vocab_size=cfg.MODEL.vocab_size,
max_length_a=0,
max_length_b=args.length,
min_length=1,
temperature=args.temperature,
sampling=True,
sampling_topp=args.top_p,
sampling_topk=args.top_k,
early_return=False
)

start_input = mx.np.full((args.batch_size, 1), tokenizer.vocab.eos_id, ctx=ctx)
start_states = gpt2decoder.init_states(args.batch_size, ctx)

generated = 0
while args.nsamples <= 0 or generated < args.nsamples:
samples, _, _ = sampler(start_input, start_states)
for i in range(args.batch_size):
generated += 1
ids = samples[i][0].asnumpy().tolist()
            ids = ids[1:ids.index(-1)] if -1 in ids else ids[1:]
text = tokenizer.decode(ids)
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)


if __name__ == '__main__':
os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
args = parse_args()
if args.seed is not None:
set_seed(args.seed)
sample_gpt2(args)